Compare commits

...

63 Commits

Author SHA1 Message Date
Steve L. Nyemba 56e10454ed Merge pull request 'v2.2.0' (#35) from v2.2.0 into master
5 days ago
Steve L. Nyemba 2ffb775c3d Merge branch 'master' into v2.2.0
5 days ago
Steve Nyemba 89d762f39a bug fixes: conditional imports
5 days ago
Steve Nyemba 6e753a1fcd bug fixes
5 days ago
Steve Nyemba 18c54d7664 bug fixes
5 days ago
Steve Nyemba f06d26f9b6 bug fixes:installer & imports
5 days ago
Steve L. Nyemba 8fdcbce42d Merge pull request 'v2.2.0' (#34) from v2.2.0 into master
5 days ago
Steve Nyemba be10ae17d7 bug fixes: installer & registry
5 days ago
Steve Nyemba befdf453f5 bug fix: crash with etl & process
1 week ago
Steve L. Nyemba b461ce9d7b Merge pull request 'v2.2.0' (#33) from v2.2.0 into master
1 week ago
Steve Nyemba fbdb4a4931 bug fix: registry and emails
1 week ago
Steve Nyemba 6e1c420952 project file specification
2 weeks ago
Steve Nyemba 66d881fdda upgrade pyproject.toml, bug fix with registry
2 weeks ago
Steve L. Nyemba 6c26588462 Merge pull request 'v2.2.0' (#32) from v2.2.0 into master
3 weeks ago
Steve Nyemba de4e065ca6 bug fix with newer setuptools
3 weeks ago
Steve Nyemba e035f5eba0 windows bug fix, environment variable
3 weeks ago
Steve Nyemba 6f8019f582 bug fix
4 weeks ago
Steve L. Nyemba d3517a5720 Merge pull request 'bug fix: logger issue' (#31) from v2.2.0 into master
1 month ago
Steve Nyemba b0cd0b85dc bug fix: logger issue
2 months ago
Steve L. Nyemba 4c98e81c14 Merge pull request 'v2.2.0: bug fixes' (#30) from v2.2.0 into master
3 months ago
Steve Nyemba 4b34c746ae bug fix: missing table
3 months ago
Steve Nyemba 0977ad1b18 setup fixes
4 months ago
Steve Nyemba 98ef8a848e bug fixes and dependencies
4 months ago
Steve Nyemba 469c6f89a2 fixes with plugin handler
4 months ago
Steve Nyemba dd10f6db78 bug fix: version & cli
5 months ago
Steve Nyemba dad2956a8c version update
5 months ago
Steve Nyemba eaa2b99a2d bug fix: schema (postgresql) construct
5 months ago
Steve Nyemba a1b5f2743c bug fixes ...
5 months ago
Steve Nyemba afa442ea8d versioning update edition
5 months ago
Steve Nyemba 30645e46bd bug fix: readonly for duckdb
5 months ago
Steve Nyemba cdf783143e ...
5 months ago
Steve Nyemba 1a8112f152 adding iceberg notebook
5 months ago
Steve Nyemba 49ebd4a432 bug fix: close & etl
5 months ago
Steve Nyemba c3627586b3 fix: refactor cli switches
6 months ago
Steve Nyemba 2a72de4cd6 bug fixes: registry and handling cli parameters as well as adding warehousing
6 months ago
Steve Nyemba d0e655e7e3 update, community edition baseline
9 months ago
Steve L. Nyemba 492dc8f374 Merge pull request 'new provider console and bug fixes with applied commands' (#25) from v2.2.0 into master
10 months ago
Steve Nyemba 2df926da12 new provider console and bug fixes with applied commands
10 months ago
Steve L. Nyemba e848367378 Merge pull request 'bug fix, duckdb in-memory handling' (#24) from v2.2.0 into master
10 months ago
Steve Nyemba e9aab3b034 bug fix, duckdb in-memory handling
10 months ago
Steve L. Nyemba c872ba8cc2 Merge pull request 'v2.2.0 - Bug fixes with mongodb, console' (#23) from v2.2.0 into master
10 months ago
Steve Nyemba 34db729ad4 bug fixes: mongodb console
10 months ago
Steve Nyemba a7c72391e8 s3 notebook - code as documentation
12 months ago
Steve L. Nyemba baa8164f16 Merge pull request 'aws s3 notebook, brief example' (#22) from v2.2.0 into master
1 year ago
Steve Nyemba 955369fdd8 aws s3 notebook, brief example
1 year ago
Steve L. Nyemba 31556ebd32 Merge pull request 'v2.2.0 bug fix - AWS-S3' (#21) from v2.2.0 into master
1 year ago
Steve Nyemba 63666e95ce bug fix, TODO: figure out how to parse types
1 year ago
Steve Nyemba 9dba5daecd bug fix, TODO: figure out how to parse types
1 year ago
Steve Nyemba 40f9c3930a bug fixes, using boto3 instead of boto for s3 support
1 year ago
Steve L. Nyemba 1e7839198a Merge pull request 'v2.2.0 - shared environment support and duckdb support' (#20) from v2.2.0 into master
1 year ago
Steve Nyemba 3faee02fa2 documentation ...
1 year ago
Steve Nyemba 6f6fd48982 bug fixes: environment variable usage
1 year ago
Steve Nyemba 808378afdb bug fix: delegate (new feature)
1 year ago
Steve Nyemba 2edce85aed documentation duckdb support
1 year ago
Steve Nyemba 235a44be66 bug fix: registry and parameter handling
1 year ago
Steve Nyemba 037019c1d7 bug fix
1 year ago
Steve Nyemba c443c6c953 duckdb support
1 year ago
Steve Nyemba dde4767e37 new version
1 year ago
Steve L. Nyemba dce50a967e Merge pull request 'documentation ...' (#19) from v2.0.4 into master
1 year ago
Steve L. Nyemba 5ccb073865 Merge pull request 'refactor: etl,better reusability & streamlined and threaded' (#18) from v2.0.4 into master
1 year ago
Steve L. Nyemba 3081fb98e7 Merge pull request 'version 2.0 - Refactored, Plugins support' (#17) from v2.0 into master
1 year ago
Steve L. Nyemba 58959359ad Merge pull request 'bug fix: psycopg2 with numpy' (#14) from dev into master
1 year ago
Steve L. Nyemba 68b8f6af5f Merge pull request 'fixes 2024 pandas-gbq and sqlalchemy' (#10) from dev into master
1 year ago

@ -18,6 +18,20 @@ Within the virtual environment perform the following :
pip install git+https://github.com/lnyemba/data-transport.git pip install git+https://github.com/lnyemba/data-transport.git
## Features
- read/write from over a dozen databases
- run ETL jobs seamlessly
- scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
## What's new
Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin:
1. Simpler syntax to create reader or writer
2. auth-file registry that can be referenced using a label
3. duckdb support
## Learn More ## Learn More

@ -24,19 +24,28 @@ from multiprocessing import Process
import os import os
import transport import transport
from transport import etl # from transport import etl
from transport.iowrapper import IETL
# from transport import providers # from transport import providers
import typer import typer
from typing_extensions import Annotated from typing_extensions import Annotated
from typing import Optional from typing import Optional
import time import time
from termcolor import colored from termcolor import colored
from enum import Enum
from rich import print
import plugin_ix as pix
app = typer.Typer() app = typer.Typer()
app_e = typer.Typer() #-- handles etl (run, generate)
app_x = typer.Typer() #-- handles plugins (list,add, test)
app_i = typer.Typer() #-- handles information (version, license)
app_r = typer.Typer() #-- handles registry
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
REGISTRY_FILE= 'transport-registry.json' REGISTRY_FILE= 'transport-registry.json'
CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']']) CHECK_MARK = '[ [green]\u2713[/green] ]' #' '.join(['[',colored(u'\u2713', 'green'),']'])
TIMES_MARK= ' '.join(['[',colored(u'\u2717','red'),']']) TIMES_MARK= '[ [red]\u2717[/red] ]' #' '.join(['[',colored(u'\u2717','red'),']'])
# @app.command() # @app.command()
def help() : def help() :
print (__doc__) print (__doc__)
@ -44,10 +53,15 @@ def wait(jobs):
while jobs : while jobs :
jobs = [thread for thread in jobs if thread.is_alive()] jobs = [thread for thread in jobs if thread.is_alive()]
time.sleep(1) time.sleep(1)
# def wait (jobs):
# while jobs :
# jobs = [pthread for pthread in jobs if pthread.is_alive()]
@app.command(name="apply") @app_e.command(name="run")
def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")],
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")): index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"),
batch:int = typer.Option(default=5, help="The number of parallel processes to run at once")
):
""" """
This function applies data transport ETL feature to read data from one source to write it one or several others This function applies data transport ETL feature to read data from one source to write it one or several others
""" """
@ -56,23 +70,34 @@ def apply (path:Annotated[str,typer.Argument(help="path of the configuration fil
file = open(path) file = open(path)
_config = json.loads (file.read() ) _config = json.loads (file.read() )
file.close() file.close()
if index : if index is not None:
_config = [_config[ int(index)]] _config = [_config[ int(index)]]
jobs = [] jobs = []
for _args in _config : for _args in _config :
pthread = etl.instance(**_args) #-- automatically starts the process # pthread = etl.instance(**_args) #-- automatically starts the process
def bootup ():
_worker = IETL(**_args)
_worker.run()
pthread = Process(target=bootup)
pthread.start()
jobs.append(pthread) jobs.append(pthread)
if len(jobs) == batch :
wait(jobs)
jobs = []
if jobs :
wait (jobs)
# #
# @TODO: Log the number of processes started and estimated time # @TODO: Log the number of processes started and estimated time
while jobs : # while jobs :
jobs = [pthread for pthread in jobs if pthread.is_alive()] # jobs = [pthread for pthread in jobs if pthread.is_alive()]
time.sleep(1) # time.sleep(1)
# #
# @TODO: Log the job termination here ... # @TODO: Log the job termination here ...
@app.command(name="providers") @app_i.command(name="supported")
def supported (format:Annotated[str,typer.Argument(help="format of the output, supported formats are (list,table,json)")]="table") : def supported (format:Annotated[str,typer.Argument(help="format of the output, supported formats are (list,table,json)")]="table") :
""" """
This function will print supported providers/vendors and their associated classifications This function will print supported database technologies
""" """
_df = (transport.supported()) _df = (transport.supported())
if format in ['list','json'] : if format in ['list','json'] :
@ -80,17 +105,26 @@ def supported (format:Annotated[str,typer.Argument(help="format of the output, s
else: else:
print (_df) print (_df)
print () print ()
@app_i.command(name="version")
def version ():
"""
This function will return the version of the data-transport
"""
print()
print (f'[bold] {transport.__app_name__} ,[blue] {transport.__edition__} edition [/blue], version {transport.__version__}[/bold]')
print ()
@app.command() @app_i.command(name="license")
def version(): def info():
""" """
This function will display version and license information This function will display version and license information
""" """
print()
print (transport.__app_name__,'version ',transport.__version__) print (f'[bold] {transport.__app_name__} ,{transport.__edition__}, version {transport.__version__}[/bold]')
print ()
print (transport.__license__) print (transport.__license__)
@app.command() @app_e.command()
def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]): def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]):
""" """
This function will generate a configuration template to give a sense of how to create one This function will generate a configuration template to give a sense of how to create one
@ -99,45 +133,45 @@ def generate (path:Annotated[str,typer.Argument(help="path of the ETL configurat
{ {
"source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"},
"target": "target":
[{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] [{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite3","database":"sample.db3","table":"addresses"}]
} }
] ]
file = open(path,'w') file = open(path,'w')
file.write(json.dumps(_config)) file.write(json.dumps(_config))
file.close() file.close()
print (f"""{CHECK_MARK} Successfully generated a template ETL file at {path}""" ) print (f"""{CHECK_MARK} Successfully generated a template ETL file at [bold]{path}[/bold]""" )
print ("""NOTE: Each line (source or target) is the content of an auth-file""") print ("""NOTE: Each line (source or target) is the content of an auth-file""")
@app.command(name="init") @app_r.command(name="reset")
def initregistry (email:Annotated[str,typer.Argument(help="email")], def initregistry (email:Annotated[str,typer.Argument(help="email")],
path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"), path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"),
override:bool=typer.Option(default=False,help="override existing configuration or not")): override:bool=typer.Option(default=False,help="override existing configuration or not")):
""" """
This functiion will initialize the registry and have both application and calling code loading the database parameters by a label This functiion will initialize the data-transport registry and have both application and calling code loading the database parameters by a label
""" """
try: try:
transport.registry.init(email=email, path=path, override=override) transport.registry.init(email=email, path=path, override=override)
_msg = f"""{CHECK_MARK} Successfully wrote configuration to {path} from {email}""" _msg = f"""{CHECK_MARK} Successfully wrote configuration to [bold]{path}[/bold] from [bold]{email}[/bold]"""
except Exception as e: except Exception as e:
_msg = f"{TIMES_MARK} {e}" _msg = f"{TIMES_MARK} {e}"
print (_msg) print (_msg)
print () print ()
@app.command(name="register") @app_r.command(name="add")
def register (label:Annotated[str,typer.Argument(help="unique label that will be used to load the parameters of the database")], def register (label:Annotated[str,typer.Argument(help="unique label that will be used to load the parameters of the database")],
auth_file:Annotated[str,typer.Argument(help="path of the auth_file")], auth_file:Annotated[str,typer.Argument(help="path of the auth_file")],
default:bool=typer.Option(default=False,help="set the auth_file as default"), default:bool=typer.Option(default=False,help="set the auth_file as default"),
path:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")): path:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")):
""" """
This function will register an auth-file i.e database connection and assign it a label, This function add a database label for a given auth-file. which allows access to the database using a label of your choice.
Learn more about auth-file at https://healthcareio.the-phi.com/data-transport
""" """
try: try:
if transport.registry.exists(path) : if transport.registry.exists(path) :
transport.registry.set(label=label,auth_file=auth_file, default=default, path=path) transport.registry.set(label=label,auth_file=auth_file, default=default, path=path)
_msg = f"""{CHECK_MARK} Successfully added label "{label}" to data-transport registry""" _msg = f"""{CHECK_MARK} Successfully added label [bold]"{label}"[/bold] to data-transport registry"""
else: else:
_msg = f"""{TIMES_MARK} Registry is not initialized, please initialize the registry (check help)""" _msg = f"""{TIMES_MARK} Registry is not initialized, please initialize the registry (check help)"""
except Exception as e: except Exception as e:
@ -145,6 +179,68 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be
print (_msg) print (_msg)
pass pass
@app_x.command(name='add')
def register_plugs (
    alias:Annotated[str,typer.Argument(help="unique function name within a file")],
    path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")],
    folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder"),
    ):
    """
    This function will register a file and the functions within we are interested in using
    """
    #
    # The alias argument may carry several comma-separated function names;
    # it is always normalized to a list of stripped names before registration.
    if ',' in alias :
        alias = [_name.strip() for _name in alias.split(',') if _name.strip() != '' ]
    else:
        alias = [alias.strip()]
    _pregistry = pix.Registry(folder=folder,plugin_folder='plugins/code')
    #
    # A falsy result is treated as a failed registration; a truthy result is
    # reported as the number of registered functions -- NOTE(review): confirm
    # against the plugin_ix Registry.set contract.
    _log = _pregistry.set(path,alias)
    if not _log :
        _mark = TIMES_MARK
        _msg = f"""Could NOT add the [bold]{alias}[/bold] to the registry"""
    else:
        _mark = CHECK_MARK
        _msg = f""" successfully added {alias}, {_log} functions registered"""
    print (f"""{_mark} {_msg}""")
@app_x.command(name="list")
def registry_list (folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport configuration folder")):
    """
    This function will list all the plugins (python functions/files) that are registered and can be reused
    """
    #
    # stats() is expected to return a dataframe-like object (it exposes .empty) -- TODO confirm
    _stats = pix.Registry(folder=folder).stats()
    #
    # An empty result is reported as a registry that is not ready, otherwise the stats are shown as is
    _output = f"{TIMES_MARK} registry at {folder} is not ready" if _stats.empty else _stats
    print (_output)
@app_x.command ("has")
def registry_has (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")],
    folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")) :
    """
    This function will check whether a plugin alias is found in the registry
    """
    _pregistry = pix.Registry(folder=folder)
    #
    # The outcome is only reported on the console, nothing is returned to the caller
    if _pregistry.has(alias) :
        _msg = f"{CHECK_MARK} {alias} was [bold] found [/bold] in registry "
    else:
        _msg = f"{TIMES_MARK} {alias} was [bold] NOT found [/bold] in registry "
    print (_msg)
@app_x.command(name="test")
def registry_test (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")],
    folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder")) :
    """
    This function allows to test syntax for a plugin i.e in terms of alias@function
    """
    #
    # The docstring must be the first statement of the function, otherwise it is
    # a plain string expression and the command is left without a description.
    _pregistry = pix.Registry(folder=folder)
    #
    # Only attempt to load the plugin when the registry reports the alias as known
    _pointer = _pregistry.get(alias) if _pregistry.has(alias) else None
    if _pointer:
        print (f"""{CHECK_MARK} successfully loaded [bold] {alias}[/bold] found in {folder}""")
    else:
        print (f"{TIMES_MARK} unable to load {alias}. Make sure it is registered")
app.add_typer(app_e,name='etl',help="This function will run etl or generate a template etl configuration file")
app.add_typer(app_r,name='registry',help='This function allows labeling database access information')
app.add_typer(app_i,name="info",help="This function will print either license or supported database technologies")
app.add_typer(app_x, name="plugins",help="This function enables add/list/test of plugins in the registry")
if __name__ == '__main__' : if __name__ == '__main__' :
app() app()

@ -1,7 +1,8 @@
__app_name__ = 'data-transport' __app_name__ = 'data-transport'
__author__ = 'The Phi Technology' __author__ = 'The Phi Technology'
__version__= '2.0.4' __version__= '2.2.22'
__email__ = "info@the-phi.com" __email__ = "info@the-phi.com"
__edition__= 'community'
__license__=f""" __license__=f"""
Copyright 2010 - 2024, Steve L. Nyemba Copyright 2010 - 2024, Steve L. Nyemba
@ -11,4 +12,12 @@ The above copyright notice and this permission notice shall be included in all c
THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""
__whatsnew__=f"""version {__version__},
1. Added support for read/write logs as well as plugins (when applied)
2. Bug fix with duckdb (adding readonly) for readers because there are issues with threads & processes
3. support for streaming data, important to use this with large volumes of data
""" """

@ -0,0 +1,138 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Writing to Apache Iceberg\n",
"\n",
"1. Insure you have a Google Bigquery service account key on disk\n",
"2. The service key location is set as an environment variable **BQ_KEY**\n",
"3. The dataset will be automatically created within the project associated with the service key\n",
"\n",
"The cell below creates a dataframe that will be stored within Google Bigquery"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['data transport version ', '2.4.0']\n"
]
}
],
"source": [
"#\n",
"# Writing to Google Bigquery database\n",
"#\n",
"import transport\n",
"from transport import providers\n",
"import pandas as pd\n",
"import os\n",
"\n",
"PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n",
"DATASET = 'demo'\n",
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
"# bqw = transport.get.writer(provider=providers.ICEBERG,catalog='mz',database='edw.mz',table='friends')\n",
"bqw = transport.get.writer(provider=providers.ICEBERG,table='edw.mz.friends')\n",
"bqw.write(_data,if_exists='replace') #-- default is append\n",
"print (['data transport version ', transport.__version__])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reading from Google Bigquery\n",
"\n",
"The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). \n",
"\n",
"- Basic read of the designated table (friends) created above\n",
"- Execute an aggregate SQL against the table\n",
"\n",
"**NOTE**\n",
"\n",
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" name age\n",
"0 James Bond 55\n",
"1 Steve Rogers 150\n",
"2 Steve Nyemba 44\n",
"--------- STATISTICS ------------\n"
]
}
],
"source": [
"\n",
"import transport\n",
"from transport import providers\n",
"import os\n",
"PRIVATE_KEY=os.environ['BQ_KEY']\n",
"pgr = transport.get.reader(provider=providers.ICEBERG,database='edw.mz')\n",
"_df = pgr.read(table='friends')\n",
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
"_sdf = pgr.read(sql=_query)\n",
"print (_df)\n",
"print ('--------- STATISTICS ------------')\n",
"# print (_sdf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An **auth-file** is a file that contains database parameters used to access the database. \n",
"For code in shared environments, we recommend \n",
"\n",
"1. Having the **auth-file** stored on disk \n",
"2. and the location of the file is set to an environment variable.\n",
"\n",
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,149 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Writing data-transport plugins\n",
"\n",
"The data-transport plugins are designed to automate pre/post processing i.e\n",
"\n",
" - Read -> Post processing\n",
" - Write-> Pre processing\n",
" \n",
"In this example we will assume, data and write both pre/post processing to any supported infrastructure. We will equally show how to specify the plugins within a configuration file"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Writing to Google Bigquery database\n",
"#\n",
"import transport\n",
"from transport import providers\n",
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"#\n",
"#\n",
"\n",
"DATABASE = '/home/steve/tmp/demo.db3'\n",
"if os.path.exists(DATABASE) :\n",
" os.remove(DATABASE)\n",
"#\n",
"# \n",
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
"litew = transport.get.writer(provider=providers.SQLITE,database=DATABASE)\n",
"litew.write(_data,table='friends')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reading from SQLite\n",
"\n",
"The cell below reads the data that has been written by the cell above and computes the average age from a plugin function we will write. \n",
"\n",
"- Basic read of the designated table (friends) created above\n",
"- Read with pipeline functions defined in code\n",
"\n",
"**NOTE**\n",
"\n",
"It is possible to use **transport.factory.instance** or **transport.instance** or **transport.get.<[reader|writer]>** they are the same. It allows the maintainers to know that we used a factory design pattern."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" name age\n",
"0 James Bond 55\n",
"1 Steve Rogers 150\n",
"2 Steve Nyemba 44\n",
"\n",
"\n",
" name age autoinc\n",
"0 James Bond 5.5 0\n",
"1 Steve Rogers 15.0 1\n",
"2 Steve Nyemba 4.4 2\n"
]
}
],
"source": [
"\n",
"import transport\n",
"from transport import providers\n",
"import os\n",
"import numpy as np\n",
"def _autoincrement (_data,**kwargs) :\n",
" \"\"\"\n",
" This function will add an autoincrement field to the table\n",
" \"\"\"\n",
" _data['autoinc'] = np.arange(_data.shape[0])\n",
" \n",
" return _data\n",
"def reduce(_data,**_args) :\n",
" \"\"\"\n",
" This function will reduce the age of the data frame\n",
" \"\"\"\n",
" _data.age /= 10\n",
" return _data\n",
"reader = transport.get.reader(provider=providers.SQLITE,database=DATABASE,table='friends')\n",
"#\n",
"# basic read of the data created in the first cell\n",
"_df = reader.read()\n",
"print (_df)\n",
"print ()\n",
"print()\n",
"#\n",
"# read of the data with pipeline function provided to alter the database\n",
"print (reader.read(pipeline=[_autoincrement,reduce]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The parameters for instianciating a transport object (reader or writer) can be found at [data-transport home](https://healthcareio.the-phi.com/data-transport)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,131 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Writing to AWS S3\n",
"\n",
"We have setup our demo environment with the label **aws** passed to reference our s3 access_key and secret_key and file (called friends.csv). In the cell below we will write the data to our aws s3 bucket named **com.phi.demo**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.2.1\n"
]
}
],
"source": [
"#\n",
"# Writing to mongodb database\n",
"#\n",
"import transport\n",
"from transport import providers\n",
"import pandas as pd\n",
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
"mgw = transport.get.writer(label='aws')\n",
"mgw.write(_data)\n",
"print (transport.__version__)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reading from AWS S3\n",
"\n",
"The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. The code in the background executes an aggregation using\n",
"\n",
"- Basic read of the designated file **friends.csv**\n",
"- Compute average age using standard pandas functions\n",
"\n",
"**NOTE**\n",
"\n",
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" bname age\n",
"0 James Bond 55\n",
"1 Steve Rogers 150\n",
"2 Steve Nyemba 44\n",
"--------- STATISTICS ------------\n",
"83.0\n"
]
}
],
"source": [
"\n",
"import transport\n",
"from transport import providers\n",
"import pandas as pd\n",
"\n",
"def cast(stream) :\n",
" print (stream)\n",
" return pd.DataFrame(str(stream))\n",
"mgr = transport.get.reader(label='aws')\n",
"_df = mgr.read()\n",
"print (_df)\n",
"print ('--------- STATISTICS ------------')\n",
"print (_df.age.mean())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An **auth-file** is a file that contains database parameters used to access the database. \n",
"For code in shared environments, we recommend \n",
"\n",
"1. Having the **auth-file** stored on disk \n",
"2. and the location of the file is set to an environment variable.\n",
"\n",
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,54 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "data-transport"
# The version is resolved at build time, see [tool.setuptools.dynamic]
dynamic = ["version"]
authors = [
    {name = "Steve L. Nyemba", email = "info@the-phi.com"},
]
description = ""
readme = "README.md"
# `text` carries the license name itself (it previously held the literal word "LICENSE");
# it agrees with the MIT classifier below
license = {text = "MIT"}
keywords = ["mongodb","duckdb","couchdb","rabbitmq","file","read","write","s3","sqlite"]
classifiers = [
    "License :: OSI Approved :: MIT License",
    "Topic :: Utilities",
]
# Core requirements, each one listed once
dependencies = [
    "termcolor","sqlalchemy","aiosqlite","duckdb-engine",
    "mysql-connector-python","psycopg2-binary","nzpy","pymssql",
    "typer","pandas","numpy","pyarrow","smart-open",
    "plugin-ix@git+https://github.com/lnyemba/plugins-ix",
]

[project.optional-dependencies]
sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
nosql = ["pymongo","cloudant"]
cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage","databricks-sqlalchemy"]
warehouse = ["pydrill","pyspark","sqlalchemy_drill"]
other = ["pika","flask-session"]
all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage","databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill","pika"]

[project.urls]
Homepage = "https://healthcareio.the-phi.com/git/code/transport.git"

#[project.scripts]
#transport = "transport:main"

[tool.setuptools]
include-package-data = true
zip-safe = false
script-files = ["bin/transport"]

[tool.setuptools.packages.find]
include = ["info","info.*", "transport", "transport.*"]

[tool.setuptools.dynamic]
version = {attr = "info.__version__"}
#authors = {attr = "meta.__author__"}
# If you have an info.py file, you might also want to include the author dynamically:
# [tool.setuptools.dynamic]
# version = {attr = "info.__version__"}
# authors = {attr = "info.__author__"}

@ -1,28 +0,0 @@
"""
This is the build file for the data-transport package
"""
from setuptools import setup, find_packages
import os
import sys
# from version import __version__,__author__
from info import __version__, __author__,__app_name__,__license__
def read(fname):
    """
    Return the text content of a file that sits next to this build file

    :fname  name of the file, relative to the folder of this file
    """
    # the context manager releases the handle, which was previously left open
    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
        return f.read()
#
# Arguments handed to setuptools, the meta data comes from the info module
#
args = {
    "name":__app_name__,
    "version":__version__,
    "author":__author__,"author_email":"info@the-phi.com",
    "license":__license__,
    # "packages":["transport","info","transport/sql"]},
    "packages": find_packages(include=['info','transport', 'transport.*'])}
args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite']
#
# every requirement is listed once (numpy used to appear twice)
args["install_requires"] = [
    'pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy',
    'boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session',
    'smart_open','botocore','psycopg2-binary','mysql-connector-python','pymssql']
args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git"
args['scripts'] = ['bin/transport']
# if sys.version_info[0] == 2 :
#     args['use_2to3'] = True
#     args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import']
setup(**args)

@ -18,27 +18,56 @@ Source Code is available under MIT License:
""" """
import numpy as np import numpy as np
from transport import sql, nosql, cloud, other #from transport import sql, nosql, cloud, other, warehouse
from transport import sql
try:
from transport import nosql
except Exception as e:
nosql = {}
try:
from transport import cloud
except Exception as e:
cloud = {}
try:
from transport import warehouse
except Exception as e:
warehouse = {}
try:
from transport import other
except Exception as e:
other = {}
import pandas as pd import pandas as pd
import json import json
import os import os
from info import __version__,__author__,__email__,__license__,__app_name__ from info import __version__,__author__,__email__,__license__,__app_name__,__whatsnew__,__edition__
from transport.iowrapper import IWriter, IReader, IETL from transport.iowrapper import IWriter, IReader, IETL
from transport.plugins import PluginLoader from transport.plugins import PluginLoader
from transport import providers from transport import providers
import copy import copy
from transport import registry from transport import registry
from transport.plugins import Plugin
PROVIDERS = {}
def init():
    """
    Populate PROVIDERS with every provider exposed by the available namespaces.

    Each entry maps a provider name to {'module': <provider module>, 'type': <name of its namespace>}.
    Attributes starting with '__' and the attribute named 'common' are ignored.
    """
    global PROVIDERS
    for _module in [cloud,sql,nosql,other,warehouse] :
        #
        # A namespace that failed to import is replaced by a placeholder (e.g. {}), it is skipped
        # once here instead of being tested for every attribute. NOTE: None is not a type, so the
        # former test type(_module) in [None,str,dict] never matched a None placeholder.
        if _module is None or isinstance(_module,(str,dict)) :
            continue
        for _provider_name in dir(_module) :
            if _provider_name.startswith('__') or _provider_name == 'common':
                continue
            PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__}
#
# loading the registry
if not registry.isloaded() :
registry.load()
# def _getauthfile (path) :
# f = open(path)
# _object = json.loads(f.read())
# f.close()
# return _object
def instance (**_args): def instance (**_args):
""" """
This function returns an object to read or write from a supported database provider/vendor This function returns an object to read or write from a supported database provider/vendor
@ -48,15 +77,6 @@ def instance (**_args):
kwargs These are arguments that are provider/vendor specific kwargs These are arguments that are provider/vendor specific
""" """
global PROVIDERS global PROVIDERS
# if not registry.isloaded () :
# if ('path' in _args and registry.exists(_args['path'] )) or registry.exists():
# registry.load() if 'path' not in _args else registry.load(_args['path'])
# print ([' GOT IT'])
# if 'label' in _args and registry.isloaded():
# _info = registry.get(_args['label'])
# if _info :
# #
# _args = dict(_args,**_info)
if 'auth_file' in _args: if 'auth_file' in _args:
if os.path.exists(_args['auth_file']) : if os.path.exists(_args['auth_file']) :
@ -77,12 +97,13 @@ def instance (**_args):
if not registry.isloaded () : if not registry.isloaded () :
if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): if ('path' in _args and registry.exists(_args['path'] )) or registry.exists():
registry.load() if 'path' not in _args else registry.load(_args['path']) registry.load() if 'path' not in _args else registry.load(_args['path'])
_info = {}
if 'label' in _args and registry.isloaded(): if 'label' in _args and registry.isloaded():
_info = registry.get(_args['label']) _info = registry.get(_args['label'])
else:
_info = registry.get()
if _info : if _info :
# _args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments
_args = dict(_args,**_info)
if 'provider' in _args and _args['provider'] in PROVIDERS : if 'provider' in _args and _args['provider'] in PROVIDERS :
_info = PROVIDERS[_args['provider']] _info = PROVIDERS[_args['provider']]
@ -112,8 +133,32 @@ def instance (**_args):
# for _delegate in _params : # for _delegate in _params :
# loader.set(_delegate) # loader.set(_delegate)
loader = None if 'plugins' not in _args else _args['plugins'] _plugins = None if 'plugins' not in _args else _args['plugins']
return IReader(_agent,loader) if _context == 'read' else IWriter(_agent,loader)
# if registry.has('logger') :
# _kwa = registry.get('logger')
# _lmodule = getPROVIDERS[_kwa['provider']]
if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')):
#
# We did not request label called logger, so we are setting up a logger if it is specified in the registry
#
_kwargs = registry.get('logger')
_kwargs['context'] = 'write'
_kwargs['table'] =_module.__name__.split('.')[-1]+'_logs'
# _logger = instance(**_kwargs)
_module = PROVIDERS[_kwargs['provider']]['module']
_logger = getattr(_module,'Writer')
_logger = _logger(**_kwargs)
else:
_logger = None
_kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger}
if 'args' in _args :
_kwargs['args'] = _args['args']
# _datatransport = IReader(_agent,_plugins,_logger) if _context == 'read' else IWriter(_agent,_plugins,_logger)
_datatransport = IReader(**_kwargs) if _context == 'read' else IWriter(**_kwargs)
return _datatransport
else: else:
# #
@ -127,22 +172,45 @@ class get :
""" """
@staticmethod
def reader (**_args):
    """
    Return a reader built by instance(), the context is forced to 'read'.
    The 'default' label is used when neither a provider nor a label is given.
    """
    _is_targeted = 'provider' in _args or 'label' in _args
    if not _is_targeted :
        _args['label'] = 'default'
    _args['context'] = 'read'
    return instance(**_args)
@staticmethod
def writer(**_args):
    """
    This function is a wrapper that will return a writer to a database. It disambiguates the interface
    The context is forced to 'write' and the 'default' label is used when neither a provider nor a label is given.
    """
    _is_targeted = 'provider' in _args or 'label' in _args
    if not _is_targeted :
        _args['label'] = 'default'
    _args['context'] = 'write'
    return instance(**_args)
@staticmethod
def logger ():
    """
    Return a writer built from the registry entry labelled 'logger', or None when the registry has no such entry
    """
    if not registry.has('logger') :
        return None
    _args = registry.get('logger')
    _args['context'] = 'write'
    return instance(**_args)
@staticmethod
def etl (**_args):
    """
    Return an IETL object built from the arguments, both 'source' and 'target' are required
    An Exception is raised when either of them is missing
    """
    if 'source' not in _args or 'target' not in _args :
        raise Exception ("Malformed input found, object must have both 'source' and 'target' attributes")
    return IETL(**_args)

@ -3,10 +3,13 @@ Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC Steve L. Nyemba, The Phi Technology LLC
This file is a wrapper around s3 bucket provided by AWS for reading and writing content This file is a wrapper around s3 bucket provided by AWS for reading and writing content
TODO:
- Address limitations that will properly read csv if it is stored with content type text/csv
""" """
from datetime import datetime from datetime import datetime
import boto import boto3
from boto.s3.connection import S3Connection, OrdinaryCallingFormat # from boto.s3.connection import S3Connection, OrdinaryCallingFormat
import numpy as np import numpy as np
import botocore import botocore
from smart_open import smart_open from smart_open import smart_open
@ -14,6 +17,7 @@ import sys
import json import json
from io import StringIO from io import StringIO
import pandas as pd
import json import json
class s3 : class s3 :
@ -29,46 +33,37 @@ class s3 :
@param filter filename or filtering elements @param filter filename or filtering elements
""" """
try: try:
self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) self._client = boto3.client('s3',aws_access_key_id=args['access_key'],aws_secret_access_key=args['secret_key'],region_name=args['region'])
self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None self._bucket_name = args['bucket']
# self.path = args['path'] self._file_name = args['file']
self.filter = args['filter'] if 'filter' in args else None self._region = args['region']
self.filename = args['file'] if 'file' in args else None
self.bucket_name = args['bucket'] if 'bucket' in args else None
except Exception as e : except Exception as e :
self.s3 = None
self.bucket = None
print (e) print (e)
pass
def has(self,**_args):
    """
    Tell whether a bucket, or a file within a bucket, can be reached with the current client

    :bucket     name of the bucket (when only a file is given, the bucket provided at initialization is used)
    :file       name (key) of the file
    :return     True when the lookup returned a response, False otherwise (also when no argument is given)
    """
    _found = None
    try:
        if 'file' in _args :
            _bucket = _args['bucket'] if 'bucket' in _args else self._bucket_name
            _found = self.meta(bucket=_bucket,file=_args['file'])
        elif 'bucket' in _args :
            #
            # a HEAD request on the bucket answers the question without listing its content
            _found = self._client.head_bucket(Bucket=_args['bucket'])
    except Exception :
        #
        # best effort: any failure (missing bucket/file, access denied ...) is reported as not found
        _found = None
    return isinstance(_found,dict)
def meta(self,**args):
    """
    This function will return the response metadata of a file in a given bucket

    :bucket     name of the bucket (defaults to the bucket provided at initialization)
    :file       name (key) of the file (defaults to the file provided at initialization)
    :return     the 'ResponseMetadata' entry of the response
    """
    _bucket = self._bucket_name if 'bucket' not in args else args['bucket']
    _file = self._file_name if 'file' not in args else args['file']
    #
    # a HEAD request carries the response metadata without opening a stream on the content of the file
    _data = self._client.head_object(Bucket=_bucket,Key=_file)
    return _data['ResponseMetadata']
def close(self):
    """
    Close the underlying s3 client
    """
    _client = self._client
    _client.close()
:name name of the bucket
"""
return list(self.s3.get_bucket(args['name']).list())
def buckets(self):
#
# This function will return all buckets, not sure why but it should be used cautiously
# based on why the s3 infrastructure is used
#
return [item.name for item in self.s3.get_all_buckets()]
# def buckets(self):
pass
# """
# This function is a wrapper around the bucket list of buckets for s3
# """
# return self.s3.get_all_buckets()
class Reader(s3) : class Reader(s3) :
""" """
@ -77,51 +72,66 @@ class Reader(s3) :
- stream content if file is Not None - stream content if file is Not None
@TODO: support read from all buckets, think about it @TODO: support read from all buckets, think about it
""" """
def __init__(self,**args) : def __init__(self,**_args) :
s3.__init__(self,**args) super().__init__(**_args)
def files(self):
def _stream(self,**_args):
    """
    Read a file from a given bucket and return its content

    :bucket     name of the bucket
    :file       name (key) of the file
    :return     a pandas.DataFrame when the content type is text/csv, the raw content otherwise,
                None when the content is empty or could not be read
    """
    _object = self._client.get_object(Bucket=_args['bucket'],Key=_args['file'])
    try:
        _content = _object['Body'].read()
    except Exception :
        #
        # best effort: a body that can not be read is reported as no content
        _content = None
    if not _content :
        return None
    if _object['ContentType'] in ['text/csv'] :
        #
        # The bytes are decoded: str(bytes) yields the "b'...'" representation, which used to
        # prefix the first column name with a 'b' and to strip every single quote from the data
        if isinstance(_content,bytes) :
            _content = _content.decode('utf-8')
        return pd.read_csv(StringIO(_content))
    return _content
with smart_open(key) as remote_file:
for line in remote_file:
if count == limit and limit > 0 :
break
yield line
count += 1
def read(self,**args) : def read(self,**args) :
if self.filename is None :
# _name = self._file_name if 'file' not in args else args['file']
# returning the list of files because no one file was specified. _bucket = args['bucket'] if 'bucket' in args else self._bucket_name
return self.files() return self._stream(bucket=_bucket,file=_name)
else:
limit = args['size'] if 'size' in args else -1
return self.stream(limit)
class Writer(s3) : class Writer(s3) :
"""
def __init__(self,**args) :
s3.__init__(self,**args)
def mkdir(self,name):
""" """
This function will create a folder in a bucket def __init__(self,**_args) :
super().__init__(**_args)
#
#
if not self.has(bucket=self._bucket_name) :
self.make_bucket(self._bucket_name)
def make_bucket(self,bucket_name):
    """
    This function will create a bucket in the region the client was initialized with
    :bucket_name    name of the bucket to create
    """
    _kwargs = {'Bucket':bucket_name}
    #
    # us-east-1 is the default location, S3 rejects it when it is passed as an explicit
    # location constraint, so the configuration is only sent for the other regions
    if self._region and self._region != 'us-east-1' :
        _kwargs['CreateBucketConfiguration'] = {'LocationConstraint': self._region}
    self._client.create_bucket(**_kwargs)
def write(self,_data,**_args):
    """
    This function will write the data to the s3 bucket, files can be either csv, or json formatted files

    :_data      pandas.DataFrame (stored as csv), dict or list (stored as json), anything else is stored as is
    :bucket     name of the bucket (defaults to the bucket provided at initialization)
    :file       name (key) of the file (defaults to the file provided at initialization)
    """
    content = 'text/plain'
    if isinstance(_data,pd.DataFrame) :
        _stream = _data.to_csv(index=False)
        content = 'text/csv'
    elif isinstance(_data,(dict,list)) :
        _stream = json.dumps(_data)
        content = 'application/json'
    else:
        #
        # str/bytes are sent untouched (wrapping them in an unused StringIO used to fail on bytes)
        _stream = _data
    bucket = self._bucket_name if 'bucket' not in _args else _args['bucket']
    file_name = self._file_name if 'file' not in _args else _args['file']
    self._client.put_object(Bucket=bucket, Key = file_name, Body=_stream,ContentType=content)

@ -0,0 +1,19 @@
"""
This file will be intended to handle duckdb database
"""
import duckdb
from transport.common import Reader,Writer
class Duck(Reader):
    """
    Base class holding the duckdb connection
    :path   optional location of the database file, duckdb.connect() is called without argument when it is missing
    """
    def __init__(self,**_args):
        super().__init__(**_args)
        self._path = _args.get('path')
        if self._path :
            self._handler = duckdb.connect(self._path)
        else:
            self._handler = duckdb.connect()
class DuckReader(Duck) :
    """
    Reader for duckdb, the read operation is not implemented yet
    """
    def __init__(self,**_args):
        super().__init__(**_args)
    def read(self,**_args) :
        #
        # @TODO: placeholder, nothing is read and None is returned
        return None

@ -5,35 +5,28 @@ NOTE: Plugins are converted to a pipeline, so we apply a pipeline when reading o
- upon initialization we will load plugins - upon initialization we will load plugins
- on read/write we apply a pipeline (if passed as an argument) - on read/write we apply a pipeline (if passed as an argument)
""" """
from transport.plugins import plugin, PluginLoader from transport.plugins import Plugin, PluginLoader
import transport import transport
from transport import providers from transport import providers
from multiprocessing import Process from multiprocessing import Process
import time import time
import plugin_ix
class IO: class IO:
""" """
Base wrapper class for read/write and support for logs Base wrapper class for read/write and support for logs
""" """
def __init__(self,_agent,plugins): def __init__(self,**_args):
_agent = _args['agent']
plugins = _args['plugins'] if 'plugins' in _args else None
self._agent = _agent self._agent = _agent
# self._ixloader = plugin_ix.Loader () #-- must indicate where the plugin registry file is
self._ixloader = plugin_ix.Loader (registry=plugin_ix.Registry(folder=transport.registry.REGISTRY_PATH))
if plugins : if plugins :
self._init_plugins(plugins) self.init_plugins(plugins)
else:
self._plugins = None
def _init_plugins(self,_args):
"""
This function will load pipelined functions as a plugin loader
"""
if 'path' in _args and 'names' in _args :
self._plugins = PluginLoader(**_args)
else:
self._plugins = PluginLoader()
[self._plugins.set(_pointer) for _pointer in _args]
#
# @TODO: We should have a way to log what plugins are loaded and ready to use
def meta (self,**_args): def meta (self,**_args):
if hasattr(self._agent,'meta') : if hasattr(self._agent,'meta') :
return self._agent.meta(**_args) return self._agent.meta(**_args)
@ -42,40 +35,58 @@ class IO:
def close(self): def close(self):
if hasattr(self._agent,'close') : if hasattr(self._agent,'close') :
self._agent.close() self._agent.close()
def apply(self): # def apply(self):
""" # """
applying pre/post conditions given a pipeline expression # applying pre/post conditions given a pipeline expression
""" # """
for _pointer in self._plugins : # for _pointer in self._plugins :
_data = _pointer(_data) # _data = _pointer(_data)
def apply(self,_query): def apply(self,_query):
if hasattr(self._agent,'apply') : if hasattr(self._agent,'apply') :
return self._agent.apply(_query) return self._agent.apply(_query)
return None return None
def submit(self,_query):
return self.delegate('submit',_query)
def delegate(self,_name,_query):
if hasattr(self._agent,_name) :
pointer = getattr(self._agent,_name)
return pointer(_query)
return None
def init_plugins(self,plugins):
for _ref in plugins :
self._ixloader.set(_ref)
class IReader(IO): class IReader(IO):
""" """
This is a wrapper for read functionalities This is a wrapper for read functionalities
""" """
def __init__(self,_agent,pipeline=None): def __init__(self,**_args):
super().__init__(_agent,pipeline) super().__init__(**_args)
def read(self,**_args): def read(self,**_args):
if 'plugins' in _args : if 'plugins' in _args :
self._init_plugins(_args['plugins']) self.init_plugins(_args['plugins'])
_data = self._agent.read(**_args) _data = self._agent.read(**_args)
if self._plugins and self._plugins.ratio() > 0 : # if self._plugins and self._plugins.ratio() > 0 :
_data = self._plugins.apply(_data) # _data = self._plugins.apply(_data)
# #
# output data # output data
#
# applying the design pattern
_data = self._ixloader.visitor(_data)
return _data return _data
class IWriter(IO): class IWriter(IO):
def __init__(self,_agent,pipeline=None): def __init__(self,**_args): #_agent,pipeline=None):
super().__init__(_agent,pipeline) super().__init__(**_args) #_agent,pipeline)
def write(self,_data,**_args):
    """
    Apply the registered plugins to the data, then hand the result to the agent

    :_data      data to be written
    :plugins    optional plugin references, registered before writing
    """
    if 'plugins' in _args :
        self.init_plugins(_args['plugins'])
    #
    # The output of the visitor is kept (as in IReader.read), otherwise the plugins
    # have no effect on what is written
    _data = self._ixloader.visitor(_data)
    self._agent.write(_data,**_args)
# #
@ -87,7 +98,7 @@ class IETL(IReader) :
This class performs an ETL operation by inheriting a read and adding writes as pipeline functions This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
""" """
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(transport.get.reader(**_args['source'])) super().__init__(agent=transport.get.reader(**_args['source']),plugins=None)
if 'target' in _args: if 'target' in _args:
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']] self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
else: else:
@ -98,16 +109,23 @@ class IETL(IReader) :
self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess'] self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
def read(self,**_args): def read(self,**_args):
_data = super().read(**_args) _data = super().read(**_args)
_schema = super().meta()
for _kwargs in self._targets : for _kwargs in self._targets :
if _schema :
_kwargs['schema'] = _schema
self.post(_data,**_kwargs) self.post(_data,**_kwargs)
return _data return _data
def run(self) :
return self.read()
def post (self,_data,**_args) :
    """
    This function writes the data to a single target and closes the writer
    :_data  data to be written
    :_args  parameters associated with writer object, a 'schema' entry is also forwarded to the write call
    """
    writer = transport.get.writer(**_args)
    try:
        if 'schema' in _args :
            writer.write(_data,schema=_args['schema'])
        else:
            writer.write(_data)
    finally:
        #
        # the writer is released even when the write operation fails
        writer.close()

@ -33,6 +33,8 @@ class Mongo :
:password password for current user :password password for current user
""" """
self.host = 'localhost' if 'host' not in args else args['host'] self.host = 'localhost' if 'host' not in args else args['host']
if ':' not in self.host and 'port' in args :
self.host = ':'.join([self.host,str(args['port'])])
self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism']
# authSource=(args['authSource'] if 'authSource' in args else self.dbname) # authSource=(args['authSource'] if 'authSource' in args else self.dbname)
self._lock = False if 'lock' not in args else args['lock'] self._lock = False if 'lock' not in args else args['lock']

@ -1 +1 @@
from . import files, http, rabbitmq, callback, files from . import files, http, rabbitmq, callback, files, console

@ -11,8 +11,10 @@ import importlib as IL
import importlib.util import importlib.util
import sys import sys
import os import os
import pandas as pd
import time
class plugin : class Plugin :
""" """
Implementing function decorator for data-transport plugins (post-pre)-processing Implementing function decorator for data-transport plugins (post-pre)-processing
""" """
@ -22,8 +24,9 @@ class plugin :
:mode restrict to reader/writer :mode restrict to reader/writer
:about tell what the function is about :about tell what the function is about
""" """
self._name = _args['name'] self._name = _args['name'] if 'name' in _args else None
self._about = _args['about'] self._version = _args['version'] if 'version' in _args else '0.1'
self._doc = _args['doc'] if 'doc' in _args else "N/A"
self._mode = _args['mode'] if 'mode' in _args else 'rw' self._mode = _args['mode'] if 'mode' in _args else 'rw'
def __call__(self,pointer,**kwargs): def __call__(self,pointer,**kwargs):
def wrapper(_args,**kwargs): def wrapper(_args,**kwargs):
@ -32,57 +35,67 @@ class plugin :
# @TODO: # @TODO:
# add attributes to the wrapper object # add attributes to the wrapper object
# #
self._name = pointer.__name__ if not self._name else self._name
setattr(wrapper,'transport',True) setattr(wrapper,'transport',True)
setattr(wrapper,'name',self._name) setattr(wrapper,'name',self._name)
setattr(wrapper,'mode',self._mode) setattr(wrapper,'version',self._version)
setattr(wrapper,'about',self._about) setattr(wrapper,'doc',self._doc)
return wrapper return wrapper
class PluginLoader : class PluginLoader :
""" """
This class is intended to load a plugin and make it available and assess the quality of the developed plugin This class is intended to load a plugin and make it available and assess the quality of the developed plugin
""" """
def __init__(self,**_args): def __init__(self,**_args):
""" """
:path location of the plugin (should be a single file)
:_names of functions to load
""" """
_names = _args['names'] if 'names' in _args else None # _names = _args['names'] if 'names' in _args else None
path = _args['path'] if 'path' in _args else None # path = _args['path'] if 'path' in _args else None
self._names = _names if type(_names) == list else [_names] # self._names = _names if type(_names) == list else [_names]
self._modules = {} self._modules = {}
self._names = [] self._names = []
if path and os.path.exists(path) and _names: self._registry = _args['registry']
for _name in self._names :
spec = importlib.util.spec_from_file_location('private', path) pass
def load (self,**_args):
    """
    This function loads the plugins found in a python file
    :path   location of the file, nothing is loaded when it does not exist

    Every attribute of the file recognized by isplugin is stored in self._modules and its name in self._names
    """
    self._modules = {}
    self._names = []
    path = _args ['path']
    if os.path.exists(path) :
        _alias = path.split(os.sep)[-1]
        spec = importlib.util.spec_from_file_location(_alias, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module) #-- runs the file, the module is not added to sys.modules
        for _name in dir(module) :
            if self.isplugin(module,_name) :
                #
                # NOTE: this used to assign to self._module, an attribute that does not exist
                self._modules[_name] = getattr(module,_name)
                self._names.append(_name)
print ([f'Found {_name}', 'not plugin']) def format (self,**_args):
else: uri = _args['alias'],_args['name']
# # def set(self,_pointer) :
# @TODO: We should log this somewhere some how def set(self,_key) :
print (['skipping ',_name, hasattr(module,_name)])
pass
else:
#
# Initialization is empty
self._names = []
pass
def set(self,_pointer) :
""" """
This function will set a pointer to the list of modules to be called This function will set a pointer to the list of modules to be called
This should be used within the context of using the framework as a library This should be used within the context of using the framework as a library
""" """
_name = _pointer.__name__ if type(_key).__name__ == 'function':
#
# The pointer is in the code provided by the user and loaded in memory
#
_pointer = _key
_key = 'inline@'+_key.__name__
# self._names.append(_key.__name__)
else:
_pointer = self._registry.get(key=_key)
if _pointer :
self._modules[_key] = _pointer
self._names.append(_key)
self._modules[_name] = _pointer
self._names.append(_name)
def isplugin(self,module,name): def isplugin(self,module,name):
""" """
This function determines if a module is a recognized plugin This function determines if a module is a recognized plugin
@ -107,12 +120,31 @@ class PluginLoader :
_n = len(self._names) _n = len(self._names)
return len(set(self._modules.keys()) & set (self._names)) / _n return len(set(self._modules.keys()) & set (self._names)) / _n
def apply(self,_data): def apply(self,_data,_logger=[]):
_input= {}
for _name in self._modules : for _name in self._modules :
try:
_input = {'action':'plugin','object':_name,'input':{'status':'PASS'}}
_pointer = self._modules[_name] _pointer = self._modules[_name]
if type(_data) == list :
_data = pd.DataFrame(_data)
_brow,_bcol = list(_data.shape)
# #
# @TODO: add exception handling # @TODO: add exception handling
_data = _pointer(_data) _data = _pointer(_data)
_input['input']['shape'] = {'rows-dropped':_brow - _data.shape[0]}
except Exception as e:
_input['input']['status'] = 'FAILED'
print (e)
time.sleep(1)
if _logger:
try:
_logger(**_input)
except Exception as e:
pass
return _data return _data
# def apply(self,_data,_name): # def apply(self,_data,_name):
# """ # """

@ -10,8 +10,11 @@ HTTP='http'
BIGQUERY ='bigquery' BIGQUERY ='bigquery'
FILE = 'file' FILE = 'file'
ETL = 'etl' ETL = 'etl'
SQLITE = 'sqlite'
SQLITE = 'sqlite3'
SQLITE3= 'sqlite3' SQLITE3= 'sqlite3'
DUCKDB = 'duckdb'
REDSHIFT = 'redshift' REDSHIFT = 'redshift'
NETEZZA = 'netezza' NETEZZA = 'netezza'
MYSQL = 'mysql' MYSQL = 'mysql'
@ -41,6 +44,9 @@ PGSQL = POSTGRESQL
AWS_S3 = 's3' AWS_S3 = 's3'
RABBIT = RABBITMQ RABBIT = RABBITMQ
ICEBERG='iceberg'
APACHE_ICEBERG = 'iceberg'
DRILL = 'drill'
APACHE_DRILL = 'drill'
# QLISTENER = 'qlistener' # QLISTENER = 'qlistener'

@ -3,46 +3,59 @@ import json
from info import __version__ from info import __version__
import copy import copy
import transport import transport
import importlib
import importlib.util
import shutil
from io import StringIO
""" """
This class manages data from the registry and allows (read only) This class manages data from the registry and allows (read only)
@TODO: add property to the DATA attribute @TODO: add property to the DATA attribute
""" """
#
# The registry is stored in the home folder of the user: HOME, otherwise USERPROFILE (windows).
# os.path.expanduser is the last resort so that importing this file does not fail when neither is set
#
_HOME = os.environ.get('HOME') or os.environ.get('USERPROFILE') or os.path.expanduser('~')
REGISTRY_PATH=os.sep.join([_HOME,'.data-transport'])
#
# This path can be overridden by an environment variable ...
#
if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ :
    REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH']
REGISTRY_FILE= 'transport-registry.json'
DATA = {}
def isloaded ():
    """
    This function tells if the registry has been loaded i.e DATA is neither empty nor None
    """
    return DATA not in [{},None]
def exists (path=REGISTRY_PATH,_file=REGISTRY_FILE) :
    """
    This function determines if there is a registry at all
    :path   folder of the registry
    :_file  name of the registry file within the folder
    """
    p = os.path.exists(path)
    q = os.path.exists( os.sep.join([path,_file]))
    return p and q
def load (_path=REGISTRY_PATH,_file=REGISTRY_FILE):
    """
    This function loads the content of the registry file (json) into DATA, nothing happens if the file is missing
    :_path  folder of the registry
    :_file  name of the registry file within the folder
    """
    global DATA
    #
    # the file that is checked is the file that is read (the check used to ignore _file)
    if exists(_path,_file) :
        path = os.sep.join([_path,_file])
        with open(path) as f :
            DATA = json.loads(f.read())
def init (email,path=REGISTRY_PATH,override=False): def init (email,path=REGISTRY_PATH,override=False,_file=REGISTRY_FILE):
""" """
Initializes the registry and raises an exception in the event of an issue Initializes the registry and raises an exception in the event of an issue
""" """
p = '@' in email p = '@' in email
q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai'] #q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai','org']
q = len(email.split('.')[-1]) in [2,3]
if p and q : if p and q :
_config = {"email":email,'version':__version__} _config = {"email":email,'version':__version__}
if not os.path.exists(path): if not os.path.exists(path):
os.makedirs(path) os.makedirs(path)
filename = os.sep.join([path,REGISTRY_FILE]) filename = os.sep.join([path,_file])
if not os.path.exists(filename) or override == True : if not os.path.exists(filename) or override == True :
f = open(filename,'w') f = open(filename,'w')
@ -57,6 +70,8 @@ def init (email,path=REGISTRY_PATH,override=False):
def lookup (label): def lookup (label):
global DATA global DATA
return label in DATA return label in DATA
has = lookup
def get (label='default') : def get (label='default') :
global DATA global DATA
return copy.copy(DATA[label]) if label in DATA else {} return copy.copy(DATA[label]) if label in DATA else {}
@ -68,8 +83,11 @@ def set (label, auth_file, default=False,path=REGISTRY_PATH) :
if label == 'default' : if label == 'default' :
raise Exception ("""Invalid label name provided, please change the label name and use the switch""") raise Exception ("""Invalid label name provided, please change the label name and use the switch""")
reg_file = os.sep.join([path,REGISTRY_FILE]) reg_file = os.sep.join([path,REGISTRY_FILE])
if os.path.exists (auth_file) and os.path.exists(path) and os.path.exists(reg_file): if os.path.exists(path) and os.path.exists(reg_file):
if type(auth_file) == str and os.path.exists (auth_file) :
f = open(auth_file) f = open(auth_file)
elif type(auth_file) == StringIO:
f = auth_file
_info = json.loads(f.read()) _info = json.loads(f.read())
f.close() f.close()
f = open(reg_file) f = open(reg_file)

@ -3,7 +3,7 @@ This namespace/package wrap the sql functionalities for a certain data-stores
- netezza, postgresql, mysql and sqlite - netezza, postgresql, mysql and sqlite
- mariadb, redshift (also included) - mariadb, redshift (also included)
""" """
from . import postgresql, mysql, netezza, sqlite, sqlserver from . import postgresql, mysql, netezza, sqlite, sqlserver, duckdb
# #

@ -3,6 +3,8 @@ This file encapsulates common operations associated with SQL databases via SQLAl
""" """
import sqlalchemy as sqa import sqlalchemy as sqa
from sqlalchemy import text , MetaData, inspect
import pandas as pd import pandas as pd
class Base: class Base:
@ -11,7 +13,13 @@ class Base:
self._port = None self._port = None
self._database = _args['database'] self._database = _args['database']
self._table = _args['table'] if 'table' in _args else None self._table = _args['table'] if 'table' in _args else None
self._engine= sqa.create_engine(self._get_uri(**_args),future=True) _uri = self._get_uri(**_args)
if type(_uri) == str :
self._engine= sqa.create_engine(_uri,future=True)
else:
_uri,_kwargs = _uri
self._engine= sqa.create_engine(_uri,**_kwargs,future=True)
def _set_uri(self,**_args) : def _set_uri(self,**_args) :
""" """
:provider provider :provider provider
@ -32,21 +40,33 @@ class Base:
:table optional name of the table (can be fully qualified) :table optional name of the table (can be fully qualified)
""" """
_table = self._table if 'table' not in _args else _args['table'] _table = self._table if 'table' not in _args else _args['table']
_map = {'TINYINT':'INTEGER','BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'}
_schema = [] _schema = []
if _table : # if _table :
if sqa.__version__.startswith('1.') : # if sqa.__version__.startswith('1.') :
_handler = sqa.MetaData(bind=self._engine) # _handler = sqa.MetaData(bind=self._engine)
_handler.reflect() # _handler.reflect()
else: # else:
# # #
# sqlalchemy's version 2.+ # # sqlalchemy's version 2.+
_handler = sqa.MetaData() # _handler = sqa.MetaData()
_handler.reflect(bind=self._engine) # _handler.reflect(bind=self._engine)
# #
# # Let us extract the schema with the native types
# _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'}
# _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns]
# #
# Let us extract the schema with the native types try:
_map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} if _table :
_schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] _inspector = inspect(self._engine)
_columns = _inspector.get_columns(_table)
_schema = [{'name':column['name'],'type':_map.get(str(column['type']),str(column['type'])) } for column in _columns]
return _schema return _schema
except Exception as e:
pass
# else:
return []
def has(self,**_args): def has(self,**_args):
return self.meta(**_args) return self.meta(**_args)
def apply(self,sql): def apply(self,sql):
@ -56,11 +76,20 @@ class Base:
@TODO: Execution of stored procedures @TODO: Execution of stored procedures
""" """
return pd.read_sql(sql,self._engine) if sql.lower().startswith('select') or sql.lower().startswith('with') else None if sql.strip().lower().startswith('select') or sql.strip().lower().startswith('with') or sql.strip().startswith('show'):
return pd.read_sql(sql,self._engine)
else:
_handler = self._engine.connect()
_handler.execute(text(sql))
_handler.commit ()
_handler.close()
return None
class SQLBase(Base): class SQLBase(Base):
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
self._schema = _args.get('schema',None)
def get_provider(self): def get_provider(self):
raise Exception ("Provider Needs to be set ...") raise Exception ("Provider Needs to be set ...")
def get_default_port(self) : def get_default_port(self) :
@ -84,7 +113,11 @@ class SQLBase(Base):
# _uri = [_item.strip() for _item in _uri if _item.strip()] # _uri = [_item.strip() for _item in _uri if _item.strip()]
# return '/'.join(_uri) # return '/'.join(_uri)
return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}' return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}'
def close(self,) :
try:
self._engine.dispose()
except :
pass
class BaseReader(SQLBase): class BaseReader(SQLBase):
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
@ -96,6 +129,8 @@ class BaseReader(SQLBase):
sql = _args['sql'] sql = _args['sql']
else: else:
_table = _args['table'] if 'table' in _args else self._table _table = _args['table'] if 'table' in _args else self._table
if self._schema and type(self._schema) == str :
_table = f'{self._schema}.{_table}'
sql = f'SELECT * FROM {_table}' sql = f'SELECT * FROM {_table}'
return self.apply(sql) return self.apply(sql)
@ -106,9 +141,11 @@ class BaseWriter (SQLBase):
""" """
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
def write(self,_data,**_args): def write(self,_data,**_args):
if type(_data) == dict : if type(_data) == dict :
_df = pd.DataFrame(_data) _df = pd.DataFrame([_data])
elif type(_data) == list : elif type(_data) == list :
_df = pd.DataFrame(_data) _df = pd.DataFrame(_data)
else: else:
@ -125,5 +162,8 @@ class BaseWriter (SQLBase):
# _mode['schema'] = _args['schema'] # _mode['schema'] = _args['schema']
# if 'if_exists' in _args : # if 'if_exists' in _args :
# _mode['if_exists'] = _args['if_exists'] # _mode['if_exists'] = _args['if_exists']
if 'schema' in _args and type(_args['schema']) == str:
self._schema = _args.get('schema',None)
if self._schema :
_mode['schema'] = self._schema
_df.to_sql(_table,self._engine,**_mode) _df.to_sql(_table,self._engine,**_mode)

@ -0,0 +1,26 @@
"""
This module implements the handler for duckdb (in memory or not)
"""
from transport.sql.common import Base, BaseReader, BaseWriter
class Duck :
    """
    Settings shared by the duckdb reader and writer: database location, provider name and sqlalchemy URI

    :database   path of the duckdb file; when missing, None or empty the URI carries an empty path
    """
    def __init__(self,**_args):
        #
        # duckdb with none as database will operate as an in-memory database
        # An explicit None must not be formatted into the URI as the text 'None' (that would name a file)
        #
        self.database = _args.get('database') or ''
    def get_provider(self):
        """
        Returns the provider name of this handler
        """
        return "duckdb"
    def _get_uri(self,**_args):
        """
        Returns the sqlalchemy URI built from the configured database (keyword arguments are ignored)
        """
        return f"""duckdb:///{self.database}"""
class Reader(Duck,BaseReader) :
    """
    duckdb reader, the database file is opened in read-only mode
    """
    def __init__(self,**_args):
        # Duck must be initialized first: the base initializer calls _get_uri, which reads self.database
        Duck.__init__(self,**_args)
        BaseReader.__init__(self,**_args)
    def _get_uri(self,**_args):
        """
        Returns either the URI alone (no database file configured) or a tuple (URI, engine keyword arguments)
        NOTE: duckdb refuses to launch an in-memory database in read-only mode, so read_only is only requested for a file
        """
        _uri = super()._get_uri(**_args)
        if not self.database :
            return _uri
        return _uri,{'connect_args':{'read_only':True}}
class Writer(Duck,BaseWriter):
    """
    duckdb writer, combines the duckdb settings with the generic SQL writer
    """
    def __init__(self,**_args):
        #
        # Both parents are initialized explicitly and in this order (duckdb settings first, then the SQL writer)
        for _parent in (Duck,BaseWriter) :
            _parent.__init__(self,**_args)

@ -0,0 +1,7 @@
"""
This namespace/package is intended to handle read/writes against data warehouse solutions like :
- apache iceberg
- clickhouse (...)
"""
from . import iceberg, drill

@ -0,0 +1,55 @@
import sqlalchemy
import pandas as pd
from .. sql.common import BaseReader , BaseWriter
import sqlalchemy as sqa
class Drill :
    """
    Connection settings and schema lookup for apache drill (through the drill+sadrill sqlalchemy dialect)

    :host       host name (default localhost)
    :port       port (default is given by get_default_port)
    :ssl        value of the use_ssl URI parameter (default False)
    :table      optional table name, can be fully qualified i.e <schema>.<database>.<table>
    :database   required unless the table has more than two dot separated segments
    """
    __template = {'host':None,'port':None,'ssl':None,'table':None,'database':None}
    def __init__(self,**_args):
        self._host  = _args.get('host','localhost')
        self._port  = _args.get('port',self.get_default_port())
        self._ssl   = _args.get('ssl',False)
        self._table = _args.get('table')
        #
        # The schema & database come from a fully qualified table when there is one,
        # otherwise from the database argument, so that both attributes are always set (_get_uri needs the database)
        #
        _seg = self._table.split('.') if self._table else []
        if len(_seg) > 2 :
            self._schema,self._database = _seg[:2]
        else:
            self._database = _args['database']
            self._schema = self._database.split('.')[0]
    def _get_uri(self,**_args):
        """
        Returns the sqlalchemy URI built from the host, port, database and ssl settings
        """
        return f'drill+sadrill://{self._host}:{self._port}/{self._database}?use_ssl={self._ssl}'
    def get_provider(self):
        return "drill+sadrill"
    def get_default_port(self):
        return "8047"
    def meta(self,**_args):
        """
        Returns the columns of a table as a list of {name,type} records read from INFORMATION_SCHEMA.COLUMNS
        An empty list is returned if no table is known or if the query fails (the error is printed)

        :table  optional name of the table (defaults to the table given to the constructor)
        """
        _table = _args['table'] if 'table' in _args else self._table
        if not _table :
            return []
        if '.' in _table :
            _seg = _table.split('.')
            _schema = '.'.join(_seg[:2])
            _table  = _seg[-1]
        else:
            _schema = self._schema
        #
        # NOTE: the schema & table names are interpolated into the query as-is, they must come from a trusted source
        _sql = f"select COLUMN_NAME AS name, CASE WHEN DATA_TYPE ='CHARACTER VARYING' THEN 'CHAR ( '||COLUMN_SIZE||' )' ELSE DATA_TYPE END AS type from INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='{_schema}' and TABLE_NAME='{_table}'"
        try:
            _df = pd.read_sql(_sql,self._engine)
            return _df.to_dict(orient='records')
        except Exception as e:
            # best effort: a failed lookup is reported and treated as "no schema available"
            print (e)
        return []
class Reader (Drill,BaseReader) :
    """
    apache drill reader

    :chunksize  optional chunk size kept on the instance (default 0)
    """
    def __init__(self,**_args):
        super().__init__(**_args)
        self._chunksize = _args.get('chunksize',0)
        #
        # The engine is created here from the URI assembled out of the drill settings
        _uri = self._get_uri()
        self._engine= sqa.create_engine(_uri,future=True)
class Writer(Drill,BaseWriter):
    """
    apache drill writer
    """
    def __init__(self,**_args):
        #
        # self must not be passed explicitly through super(), doing so raises a TypeError (two positional arguments)
        super().__init__(**_args)
        #
        # Drill.__init__ does not chain to the SQL base classes, so the engine is created here (same as the reader)
        self._engine= sqa.create_engine(self._get_uri(),future=True)

@ -0,0 +1,151 @@
"""
dependency:
- spark and SPARK_HOME environment variable must be set
NOTE:
When using the streaming option, ensure that it is in line with the default (1000 rows) or increase it in spark-defaults.conf
"""
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date, to_timestamp
import copy
class Iceberg :
    """
    Base handler for apache iceberg (through a spark session): holds the session, the current catalog/database
    and implements the operations shared by the reader and the writer (meta, has, apply, close)
    """
    def __init__(self,**_args):
        """
        providing catalog meta information (you must get this from apache iceberg)

        :table      optional default table name
        :catalog    optional catalog to set as the current catalog
        :database   optional database to set as the current database
        """
        #
        # @TODO:
        #   Make arrangements for additional configuration elements
        #
        self._session = SparkSession.builder.appName("data-transport").getOrCreate()
        self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
        self._catalog = self._session.catalog
        self._table = _args.get('table')
        if 'catalog' in _args :
            #
            # Let us set the default catalog
            self._catalog.setCurrentCatalog(_args['catalog'])
        if 'database' in _args :
            self._database = _args['database']
            self._catalog.setCurrentDatabase(self._database)
        #
        # Whatever was (or was not) provided, we keep the names the session reports as current
        self._catalogName = self._catalog.currentCatalog()
        self._databaseName = self._catalog.currentDatabase()
    def meta (self,**_args) :
        """
        This function should return the schema of a table (only)
        Every field is returned without its nullable & metadata entries; an empty list is returned
        if no table is known or if the table can not be inspected

        :table      optional table name (defaults to the table given to the constructor)
        :catalog    optional catalog name, overrides the current one
        :database   optional database name, overrides the current one
        """
        _table = _args['table'] if 'table' in _args else self._table
        if not _table :
            return []
        try:
            _tableName = self._getPrefix(**_args) + f".{_table}"
            _fields = self._session.table(_tableName).schema.jsonValue()['fields']
            return [{_key:_value for _key,_value in _item.items() if _key not in ('nullable','metadata')} for _item in _fields]
        except Exception as e:
            # best effort: a table that can not be inspected is reported as having no schema
            pass
        return []
    def _getPrefix (self,**_args):
        """
        Returns <catalog>.<database>, the arguments override the current catalog/database names
        """
        _catName = self._catalogName if 'catalog' not in _args else _args['catalog']
        _datName = self._databaseName if 'database' not in _args else _args['database']
        return '.'.join([_catName,_datName])
    def apply(self,_query):
        """
        sql query/command to run against apache iceberg
        """
        return self._session.sql(_query).toPandas()
    def has (self,**_args):
        """
        Returns True if the table is listed under <catalog>.<database>, False otherwise (or upon error)

        :table      optional table name (defaults to the table given to the constructor)
        """
        try:
            _table = _args['table'] if 'table' in _args else self._table
            _prefix = self._getPrefix(**_args)
            if not _table or _prefix.endswith('.') :
                return False
            return _table in [_item.name for _item in self._catalog.listTables(_prefix)]
        except Exception as e:
            print (e)
        return False
    def close(self):
        """
        Stops the underlying spark session
        """
        self._session.stop()
class Reader(Iceberg) :
    """
    Reading data from an Apache Iceberg data warehouse (using pyspark), the result is a pandas data-frame
    """
    def __init__(self,**_args):
        super().__init__(**_args)
    def read(self,**_args):
        """
        Reads either a table or the result of a query

        :table      table to read, takes precedence over everything else
        :sql        query to run, used when no table is passed to this call
        :catalog    optional catalog name, overrides the current one
        :database   optional database name, overrides the current one

        With neither argument the table given to the constructor is read;
        a KeyError is raised if there is nothing to read (no table and no sql)
        """
        #
        # A query passed by the caller must not be silently ignored because the instance has a default table
        if 'table' in _args or ('sql' not in _args and self._table) :
            _table = _args['table'] if 'table' in _args else self._table
            _table = self._getPrefix(**_args) + f'.{_table}'
            return self._session.table(_table).toPandas()
        sql = _args['sql']
        return self._session.sql(sql).toPandas()
class Writer (Iceberg):
    """
    Writing data to an Apache Iceberg data warehouse (using pyspark)
    """
    def __init__(self,**_args):
        """
        :mode   save mode applied when the table already exists (default append)
        :table  optional default table name
        """
        super().__init__(**_args)
        self._mode = 'append' if 'mode' not in _args else _args['mode']
        self._table = None if 'table' not in _args else _args['table']
    def format (self,_schema) :
        """
        Converts a list of {name,type} records into a spark StructType (all fields are nullable)
        Unknown types are stored as strings; an empty list is returned when there are no fields
        """
        _map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(),
                'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()}
        _fields = [StructField(_item['name'],_map.get(_item['type'].lower(),StringType()),True) for _item in _schema]
        return StructType(_fields) if _fields else []
    def write(self,_data,**_args):
        """
        Writes data to a table: the table is created if it does not exist, otherwise the save mode is applied

        :_data      data handed to spark's createDataFrame
        :table      optional table name (defaults to the table given to the constructor)
        :schema     optional list of {name,type} records, when provided the data is verified against it
        :mode       optional save mode for this call (defaults to the mode given to the constructor)
        """
        _prefix = self._getPrefix(**_args)
        _table = self._table if 'table' not in _args else _args['table']
        if not _table :
            raise Exception (f"Table Name should be specified for catalog/database {_prefix}")
        _schema = self.format(_args['schema']) if 'schema' in _args else []
        if not _schema :
            rdd = self._session.createDataFrame(_data,verifySchema=False)
        else :
            rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True)
        _mode = self._mode if 'mode' not in _args else _args['mode']
        if not self._session.catalog.tableExists(_table):
            #
            # @TODO:
            #   add partitioning information here
            rdd.writeTo(_table).using('iceberg').create()
        else:
            #
            # The requested mode is honored (it used to be computed, then ignored in favor of a hard-coded append)
            rdd.coalesce(10).write.format('iceberg').mode(_mode).save(_table)
Loading…
Cancel
Save