@@ -5,40 +5,103 @@ NOTE: Plugins are converted to a pipeline, so we apply a pipeline when reading or writing:
        - upon initialization we will load plugins
        - on read/write we apply a pipeline (if passed as an argument)
"""
from transport.plugins import plugin, PluginLoader
from transport.plugins import PluginLoader
import transport
from transport import providers
from multiprocessing import Process
from multiprocessing import Process, RLock
import time
import types
from . import registry
from datetime import datetime
import pandas as pd
import numpy as np
import os
import sys
import itertools
import json
import plugin_ix
class IO:
class BaseIO:
    def __init__(self,**_args):
        self._logger = _args['logger'] if 'logger' in _args else None
        self._logTable = 'logs' if 'logTable' not in _args else _args['logTable']
    def setLogger(self,_logger):
        self._logger = _logger
    def log(self,**_args):
        if self._logger :
            _date = str(datetime.now())
            _data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
            for key in _data :
                if type(_data[key]) == list :
                    _data[key] = [_item.__name__ if type(_item).__name__ == 'function' else _item for _item in _data[key]]
                _data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
            self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
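#
# Sketch of the record BaseIO.log emits, assuming a logger whose write()
# accepts a DataFrame (the label used below is illustrative):
#
#   _io = BaseIO(logger=transport.get.writer(label='logger'))
#   _io.log(action='read',input={'shape':[10,3]})
#   # -> one-row DataFrame with pid, date, time, action and a JSON-encoded input
#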
class IO(BaseIO):
    """
    Base wrapper class for read/write and support for logs
    """
    def __init__(self,_agent,plugins):
    def __init__(self,**_args):
        #
        # We need to initialize the logger here ...
        #
        super().__init__(**_args)
        _agent = _args['agent']
        plugins = _args['plugins'] if 'plugins' in _args else None
        # _logger = _args['logger'] if 'logger' in _args else None
        # self._logger = _logger if not type(_agent) in [IReader,IWriter] else _agent._logger #transport.get.writer(label='logger') #if registry.has('logger') else None
        # if not _logger and hasattr(_agent,'_logger') :
        #     self._logger = getattr(_agent,'_logger')
        self._agent = _agent
        _date = str(datetime.now())
        self._ixloader = plugin_ix.Loader(registry=plugin_ix.Registry(folder=transport.registry.REGISTRY_PATH))
        # self._logTable = 'logs' #'_'.join(['logs',_date[:10]+_date[11:19]]).replace(':','').replace('-','_')
        if plugins :
            self._init_plugins(plugins)
        else:
            self._plugins = None
        self.init_plugins(plugins)
    # def setLogger(self,_logger):
    #     self._logger = _logger
    # def log (self,**_args):
    #     if self._logger :
    #         _date = str(datetime.now())
    #         _data = dict({'pid':os.getpid(),'date':_date[:10],'time':_date[11:19]},**_args)
    #         for key in _data :
    #             if type(_data[key]) == list :
    #                 _data[key] = [_item.__name__ if type(_item).__name__== 'function' else _item for _item in _data[key]]
    #             _data[key] = str(_data[key]) if type(_data[key]) not in [list,dict] else json.dumps(_data[key])
    #         self._logger.write(pd.DataFrame([_data])) #,table=self._logTable)
    # def _init_plugins(self,_items):
    #     """
    #     This function will load pipelined functions as a plugin loader
    #     """
    #     registry.plugins.init()
    #     self._plugins = PluginLoader(registry=registry.plugins)
    #     [self._plugins.set(_name) for _name in _items]
    def _init_plugins(self,_args):
        """
        This function will load pipelined functions as a plugin loader
        """
        if 'path' in _args and 'names' in _args :
            self._plugins = PluginLoader(**_args)
        else:
            self._plugins = PluginLoader()
            [self._plugins.set(_pointer) for _pointer in _args]
        #
        # @TODO: We should have a way to log what plugins are loaded and ready to use
        # self.log(action='init-plugins',object=self.getClassName(self),input=[_name for _name in _items])
        # # if 'path' in _args and 'names' in _args :
        # #     self._plugins = PluginLoader(**_args)
        # # else:
        # #     self._plugins = PluginLoader(registry=registry.plugins)
        # #     [self._plugins.set(_pointer) for _pointer in _args]
        # #
        # # @TODO: We should have a way to log what plugins are loaded and ready to use
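    #
    # Sketch of the two shapes _init_plugins accepts; the path and the
    # function names below are hypothetical:
    #
    #   self._init_plugins({'path':'plugins.py','names':['scrub','dedupe']})
    #   self._init_plugins([scrub,'registered-plugin-name'])
    #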
    def meta(self,**_args):
        if hasattr(self._agent,'meta') :
            return self._agent.meta(**_args)
        return []
    def getClassName(self,_object):
        return '.'.join([_object.__class__.__module__,_object.__class__.__name__])
    def close(self):
        if hasattr(self._agent,'close') :
            self._agent.close()
@@ -48,6 +111,7 @@ class IO:
"""
for _pointer in self . _plugins :
_data = _pointer ( _data )
time . sleep ( 1 )
def apply ( self , _query ) :
if hasattr ( self . _agent , ' apply ' ) :
return self . _agent . apply ( _query )
@@ -59,62 +123,183 @@ class IO:
            pointer = getattr(self._agent,_name)
            return pointer(_query)
        return None
    def init_plugins(self,plugins):
        # Guard: __init__ passes None when no plugins were supplied
        if not plugins :
            return
        for _ref in plugins :
            self._ixloader.set(_ref)
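    #
    # Sketch of how plugins reach the plugin_ix loader: a reference may be
    # a callable or a registered name (both names below are hypothetical):
    #
    #   io.init_plugins([scrub_dates,'my-registered-plugin'])
    #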
class IReader(IO):
    """
    This is a wrapper for read functionalities
    """
    def __init__(self,_agent,pipeline=None):
        super().__init__(_agent,pipeline)
    def __init__(self,**_args):
        super().__init__(**_args)
        self._args = _args['args'] if 'args' in _args else None
    def _stream(self,_data):
        # self.log(action='streaming',object=self._agent._engine.name, input= type(_data).__name__)
        _shape = []
        for _segment in _data :
            _shape += list(_segment.shape)
            if self._plugins :
                # yield self._plugins.apply(_segment,self.log)
                yield self._ixloader.visitor(_segment,self.log)
            else:
                yield _segment
        _objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__])
        _input = {'shape':_shape}
        if hasattr(self._agent,'_table') :
            _input['table'] = self._agent._table
        self.log(action='streaming',object=_objectName,input=_input)
    def read(self,**_args):
        if 'plugins' in _args :
            self._init_plugins(_args['plugins'])
        _data = self._agent.read(**_args)
        if self._plugins and self._plugins.ratio() > 0 :
            _data = self._plugins.apply(_data)
        #
        # output data
        return _data
            self.init_plugins(_args['plugins'])
        if self._args :
            _data = self._agent.read(**self._args)
        else:
            _data = self._agent.read(**_args)
        _objectName = '.'.join([self._agent.__class__.__module__,self._agent.__class__.__name__])
        if types.GeneratorType == type(_data) :
            return self._stream(_data)
        # if self._plugins :
        #     return self._stream(_data)
        # else:
        #     _count = 0
        #     for _segment in _data :
        #         _count += 1
        #         yield _segment
        #     self.log(action='streaming',object=_objectName, input= {'segments':_count})
        # return _data
        elif type(_data) == pd.DataFrame :
            _shape = _data.shape #[0,0] if not _data.shape[] else list(_data.shape)
            _input = {'shape':_shape}
            if hasattr(self._agent,'_table') :
                _input['table'] = self._agent._table
            self.log(action='read',object=_objectName,input=_input)
        _data = self._ixloader.visitor(_data)
        return _data
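#
# A minimal usage sketch for IReader (the provider/database values and the
# plugin callable are hypothetical; transport.get.reader and the plugins=
# argument come from this module):
#
#   import transport
#   _reader = transport.get.reader(provider='sqlite',database='sample.db3',table='users')
#   _df = _reader.read(plugins=[my_cleanup_function])
#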
class IWriter(IO):
    def __init__(self,_agent,pipeline=None):
        super().__init__(_agent,pipeline)
    lock = RLock()
    def __init__(self,**_args):
        super().__init__(**_args)
    def write(self,_data,**_args):
        if 'plugins' in _args :
            self._init_plugins(_args['plugins'])
        if self._plugins and self._plugins.ratio() > 0 :
            _data = self._plugins.apply(_data)
        self._agent.write(_data,**_args)
        # if self._plugins and self._plugins.ratio() > 0 :
        #     _logs = []
        #     _data = self._plugins.apply(_data,_logs,self.log)
        #     [self.log(**_item) for _item in _logs]
        try:
            # IWriter.lock.acquire()
            _data = self._ixloader.visitor(_data)
            self._agent.write(_data,**_args)
        finally:
            # IWriter.lock.release()
            pass
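#
# Design note: IWriter carries a class-level RLock so concurrent writes
# could be serialized; the acquire/release calls are commented out above.
# A sketch of the locked form (an assumption, not active code):
#
#   try:
#       IWriter.lock.acquire()
#       _data = self._ixloader.visitor(_data)
#       self._agent.write(_data,**_args)
#   finally:
#       IWriter.lock.release()
#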
#
# The ETL object in its simplest form is an aggregation of read/write objects
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process
class IETL(IReader):
class IETL(BaseIO):
    """
    This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
    """
    def __init__(self,**_args):
        super().__init__(transport.get.reader(**_args['source']))
        if 'target' in _args :
            self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
        else:
            self._targets = []
        self.jobs = []
        # _source = _args['source']
        # _plugins = _source['plugins'] if 'plugins' in _source else None
        # # super().__init__(transport.get.reader(**_args['source']))
        # super().__init__(agent=transport.get.reader(**_source),plugins=_plugins)
        # # _logger =
        # if 'target' in _args:
        #     self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
        # else:
        #     self._targets = []
        # self.jobs = []
        # #
        # # If the parent is already multiprocessing
        # if 'token' in _source :
        #     self._logToken = _source['token']
        # self._sourceArgs = _source['args'] if 'args' in _source else None
        # self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
        super().__init__()
        self._source = _args['source']
        self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
        #
        # If the parent is already multiprocessing
        self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
    def read(self,**_args):
        _data = super().read(**_args)
        for _kwargs in self._targets :
            self.post(_data,**_kwargs)
    # ETL Initialization, we should provide some measure of context ...
    #
    # def run(self) :
    #     """
    #     We should apply the etl here, if we are in multiprocessing mode
    #     """
    #     return self.read()
    def run(self,**_args):
        # _data = super().read(**_args) if not self._sourceArgs else super().read(**self._sourceArgs)
        # self._targets = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
        _reader = transport.get.reader(**self._source)
        if hasattr(_reader,'_logger') :
            self.setLogger(_reader._logger)
        self.log(action='init-etl',input={'source':self._source,'target':self._targets})
        _data = _reader.read(**self._source['args']) if 'args' in self._source else _reader.read()
        _reader.close()
        _writers = [transport.get.writer(**_kwargs) for _kwargs in self._targets]
        _schema = [] if not getattr(_reader._agent,'_table',None) else _reader.meta()
        if types.GeneratorType == type(_data) :
            _index = 0
            for _segment in _data :
                _index += 1
                for _writer in _writers :
                    self.post(_segment,writer=_writer,index=_index,schema=_schema)
                time.sleep(1)
        else:
            for _writer in _writers :
                self.post(_data,writer=_writer,schema=_schema)
        # pass
        return _data
        # return _data
    def post(self,_data,**_args):
        """
        This function performs the write operation for one segment of data
        :_args    parameters associated with the writer object
        """
        writer = transport.get.writer(**_args)
        writer.write(_data)
        writer.close()
        # writer = transport.get.writer(**_args)
        _input = {}
        try:
            _action = 'post'
            writer = _args['writer']
            _schema = _args['schema']
            _index = _args['index'] if 'index' in _args else 0
            _shape = dict(zip(['rows','columns'],_data.shape))
            #
            # -- things to log
            _input = {'shape':_shape,'segment':_index}
            if hasattr(writer._agent,'_table') :
                _input['table'] = writer._agent._table
            #
            # Integer columns may carry NaN after a read; coerce them back to
            # int64 so the target schema is honored
            for _item in _schema :
                if _item['type'] in ['INTEGER','BIGINT','INT'] :
                    _column = _item['name']
                    _data[_column] = _data[_column].copy().fillna(0).astype(np.int64)
            writer.write(_data,schema=_schema)
        except Exception as e:
            _action = 'post-error'
            _input['error'] = str(e)
            print([e])
            pass
        self.log(action=_action,object=writer._agent.__module__,input=_input)
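#
# A minimal end-to-end sketch. The provider names, paths and the get.etl
# factory call reflect common data-transport usage but are assumptions here;
# the source/target layout mirrors IETL.__init__ above:
#
#   import transport
#   _etl = transport.get.etl(
#       source={'provider':'sqlite','database':'sample.db3','table':'users'},
#       target=[{'provider':'csv','path':'users.csv'}])
#   _etl.run()
#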