commit
8421511446
@ -0,0 +1,6 @@
|
||||
build
|
||||
*.pyc
|
||||
*.csv
|
||||
*.json
|
||||
*.swp
|
||||
*.egg-info
|
@ -1,3 +1,40 @@
|
||||
# Introduction
|
||||
|
||||
Community edition of data-transport.
|
||||
|
||||
This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL**, **SQL** and **Cloud** data stores and leverages **pandas**.
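A minimal sketch of the interface (the SQLite file name and table are illustrative; any supported provider works the same way):

```python
import transport
from transport import providers
import pandas as pd

_data = pd.DataFrame({"name": ["James Bond", "Steve Rogers"], "age": [55, 150]})

# write objects and read objects are created separately, by design
writer = transport.get.writer(provider=providers.SQLITE, database="demo.db3", table="friends")
writer.write(_data, if_exists="replace")   #-- default is append

reader = transport.get.reader(provider=providers.SQLITE, database="demo.db3", table="friends")
print(reader.read())
```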
|
||||
|
||||
# Why Use Data-Transport?
|
||||
|
||||
Data-transport is a simple framework that:
|
||||
- is easy to install and modify (open source)
|
||||
- provides access to multiple database technologies through a single interface (pandas, SQLAlchemy)
|
||||
- enables notebook sharing without exposing database credentials
|
||||
- supports pre/post-processing specifications (pipelines)
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
Within a virtual environment, run the following:
|
||||
|
||||
pip install git+https://github.com/lnyemba/data-transport.git
|
||||
|
||||
Optional components can be selected by listing them in square brackets:
|
||||
|
||||
pip install data-transport[nosql,cloud,warehouse,all]@git+https://github.com/lnyemba/data-transport.git
|
||||
|
||||
|
||||
## Additional features
|
||||
|
||||
- In addition to read/write, support for pre/post-processing functions (pipelines)
|
||||
- A CLI to add entries to the registry and run ETL jobs (see the command sketch below)
|
||||
- Scales and integrates into shared environments such as Apache Zeppelin, JupyterHub, SageMaker, ...
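As a sketch (assuming the package's CLI entry point is installed as `transport`; the email, label and paths are illustrative):

    transport registry reset me@example.com
    transport registry add mydb /path/to/auth-file.json
    transport etl generate /tmp/etl-config.json
    transport etl run /tmp/etl-config.json --batch 5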
|
||||
|
||||
|
||||
## Learn More
|
||||
|
||||
Notebooks with sample code to read/write against MongoDB, CouchDB, Netezza, PostgreSQL, Google BigQuery, Databricks, Microsoft SQL Server, MySQL, and more are available. Visit the [data-transport homepage](https://healthcareio.the-phi.com/data-transport).
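For example, the one-to-many ETL shown in the bundled notebooks can be expressed directly in code (the source URL and target paths are illustrative):

```python
import transport

source = {"provider": "http", "url": "https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}
target = [{"provider": "files", "path": "addresses.csv", "delimiter": ","},
          {"provider": "sqlite", "database": "sample.db3", "table": "addresses"}]

_handler = transport.get.etl(source=source, target=target)
_data = _handler.read()   #-- all ETL begins with data being read
```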
|
||||
|
||||
|
@ -0,0 +1,247 @@
|
||||
#!/usr/bin/env python
|
||||
__doc__ = """
|
||||
(c) 2018 - 2021 data-transport
|
||||
steve@the-phi.com, The Phi Technology LLC
|
||||
https://dev.the-phi.com/git/steve/data-transport.git
|
||||
|
||||
This program performs ETL between 9 supported data sources: Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza, Redshift, Sqlite, File
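Usage (a sketch; this assumes the CLI entry point is installed as `transport` and the path is illustrative):
    transport etl generate /tmp/etl-config.json
    transport etl run /tmp/etl-config.json --batch 5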
|
||||
LICENSE (MIT)
|
||||
Copyright 2016-2020, The Phi Technology LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
import transport
|
||||
import time
|
||||
from multiprocessing import Process
|
||||
|
||||
import os
|
||||
# from transport import etl
|
||||
from transport.iowrapper import IETL
|
||||
# from transport import providers
|
||||
import typer
|
||||
from typing_extensions import Annotated
|
||||
from typing import Optional
|
||||
from termcolor import colored
|
||||
from enum import Enum
|
||||
from rich import print
|
||||
import plugin_ix as pix
|
||||
|
||||
|
||||
app = typer.Typer()
|
||||
app_e = typer.Typer() #-- handles etl (run, generate)
|
||||
app_x = typer.Typer() #-- handles plugins (list,add, test)
|
||||
app_i = typer.Typer() #-- handles information (version, license)
|
||||
app_r = typer.Typer() #-- handles registry
|
||||
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
|
||||
REGISTRY_FILE= 'transport-registry.json'
|
||||
CHECK_MARK = '[ [green]\u2713[/green] ]' #' '.join(['[',colored(u'\u2713', 'green'),']'])
|
||||
TIMES_MARK= '[ [red]\u2717[/red] ]' #' '.join(['[',colored(u'\u2717','red'),']'])
|
||||
# @app.command()
|
||||
def help() :
|
||||
print (__doc__)
|
||||
def wait(jobs):
|
||||
while jobs :
|
||||
jobs = [thread for thread in jobs if thread.is_alive()]
|
||||
time.sleep(1)
|
||||
# def wait (jobs):
|
||||
# while jobs :
|
||||
# jobs = [pthread for pthread in jobs if pthread.is_alive()]
|
||||
|
||||
@app_e.command(name="run")
|
||||
def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")],
|
||||
index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"),
|
||||
batch:int = typer.Option(default=5, help="The number of parallel processes to run at once")
|
||||
):
|
||||
"""
|
||||
This function applies the data-transport ETL feature: it reads data from one source and writes it to one or several targets
|
||||
"""
|
||||
# _proxy = lambda _object: _object.write(_object.read())
|
||||
if os.path.exists(path):
|
||||
file = open(path)
|
||||
_config = json.loads (file.read() )
|
||||
file.close()
|
||||
if index is not None:
|
||||
_config = [_config[ int(index)]]
|
||||
jobs = []
|
||||
for _args in _config :
|
||||
# pthread = etl.instance(**_args) #-- automatically starts the process
|
||||
def bootup ():
|
||||
_worker = IETL(**_args)
|
||||
_worker.run()
|
||||
pthread = Process(target=bootup)
|
||||
pthread.start()
|
||||
jobs.append(pthread)
|
||||
if len(jobs) == batch :
|
||||
wait(jobs)
|
||||
jobs = []
|
||||
|
||||
if jobs :
|
||||
wait (jobs)
|
||||
#
|
||||
# @TODO: Log the number of processes started and estimated time
|
||||
# while jobs :
|
||||
# jobs = [pthread for pthread in jobs if pthread.is_alive()]
|
||||
# time.sleep(1)
|
||||
#
|
||||
# @TODO: Log the job termination here ...
|
||||
@app_i.command(name="supported")
|
||||
def supported (format:Annotated[str,typer.Argument(help="format of the output, supported formats are (list,table,json)")]="table") :
|
||||
"""
|
||||
This function will print supported database technologies
|
||||
"""
|
||||
_df = (transport.supported())
|
||||
if format in ['list','json'] :
|
||||
print (json.dumps(_df.to_dict(orient="list")))
|
||||
else:
|
||||
print (_df)
|
||||
print ()
|
||||
@app_i.command(name="version")
|
||||
def version ():
|
||||
"""
|
||||
This function will return the version of data-transport
|
||||
"""
|
||||
print()
|
||||
print (f'[bold] {transport.__app_name__} ,[blue] {transport.__edition__} edition [/blue], version {transport.__version__}[/bold]')
|
||||
print ()
|
||||
|
||||
@app_i.command(name="license")
|
||||
def info():
|
||||
"""
|
||||
This function will display version and license information
|
||||
"""
|
||||
print()
|
||||
print (f'[bold] {transport.__app_name__} ,{transport.__edition__}, version {transport.__version__}[/bold]')
|
||||
print ()
|
||||
print (transport.__license__)
|
||||
|
||||
@app_e.command()
|
||||
def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]):
|
||||
"""
|
||||
This function will generate a configuration template to give a sense of how to create one
|
||||
"""
|
||||
_config = [
|
||||
{
|
||||
"source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"},
|
||||
"target":
|
||||
[{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite3","database":"sample.db3","table":"addresses"}]
|
||||
}
|
||||
]
|
||||
file = open(path,'w')
|
||||
file.write(json.dumps(_config))
|
||||
file.close()
|
||||
print (f"""{CHECK_MARK} Successfully generated a template ETL file at [bold]{path}[/bold]""" )
|
||||
print ("""NOTE: Each line (source or target) is the content of an auth-file""")
|
||||
|
||||
|
||||
|
||||
@app_r.command(name="reset")
|
||||
def initregistry (email:Annotated[str,typer.Argument(help="email")],
|
||||
path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"),
|
||||
override:bool=typer.Option(default=False,help="override existing configuration or not")):
|
||||
"""
|
||||
This function will initialize the data-transport registry so that both the application and calling code can load database parameters by a label
|
||||
|
||||
"""
|
||||
try:
|
||||
transport.registry.init(email=email, path=path, override=override)
|
||||
_msg = f"""{CHECK_MARK} Successfully wrote configuration to [bold]{path}[/bold] from [bold]{email}[/bold]"""
|
||||
except Exception as e:
|
||||
_msg = f"{TIMES_MARK} {e}"
|
||||
print (_msg)
|
||||
print ()
|
||||
@app_r.command(name="add")
|
||||
def register (label:Annotated[str,typer.Argument(help="unique label that will be used to load the parameters of the database")],
|
||||
auth_file:Annotated[str,typer.Argument(help="path of the auth_file")],
|
||||
default:bool=typer.Option(default=False,help="set the auth_file as default"),
|
||||
path:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")):
|
||||
"""
|
||||
This function adds a database label for a given auth-file, which allows access to the database using a label of your choice.
|
||||
|
||||
"""
|
||||
try:
|
||||
if transport.registry.exists(path) :
|
||||
transport.registry.set(label=label,auth_file=auth_file, default=default, path=path)
|
||||
_msg = f"""{CHECK_MARK} Successfully added label [bold]"{label}"[/bold] to data-transport registry"""
|
||||
else:
|
||||
_msg = f"""{TIMES_MARK} Registry is not initialized, please initialize the registry (check help)"""
|
||||
except Exception as e:
|
||||
_msg = f"""{TIMES_MARK} {e}"""
|
||||
print (_msg)
|
||||
|
||||
pass
|
||||
@app_x.command(name='add')
|
||||
def register_plugs (
|
||||
alias:Annotated[str,typer.Argument(help="unique function name within a file")],
|
||||
path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")],
|
||||
folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder"),
|
||||
|
||||
):
|
||||
"""
|
||||
This function will register a file and the functions within it that we are interested in using
|
||||
"""
|
||||
if ',' in alias :
|
||||
alias = [_name.strip() for _name in alias.split(',') if _name.strip() != '' ]
|
||||
else:
|
||||
alias = [alias.strip()]
|
||||
_pregistry = pix.Registry(folder=folder,plugin_folder='plugins/code')
|
||||
_log = _pregistry.set(path,alias)
|
||||
# transport.registry.plugins.init()
|
||||
# _log = transport.registry.plugins.add(alias,path)
|
||||
_mark = TIMES_MARK if not _log else CHECK_MARK
|
||||
_msg = f"""Could NOT add the [bold]{alias}[/bold]to the registry""" if not _log else f""" successfully added {alias}, {_log} functions registered"""
|
||||
print (f"""{_mark} {_msg}""")
|
||||
@app_x.command(name="list")
|
||||
def registry_list (folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport configuration folder")):
|
||||
"""
|
||||
This function will list all the plugins (python functions/files) that are registered and can be reused
|
||||
"""
|
||||
_pregistry = pix.Registry(folder=folder)
|
||||
_df = _pregistry.stats()
|
||||
if _df.empty :
|
||||
print (f"{TIMES_MARK} registry at {folder} is not ready")
|
||||
else:
|
||||
print (_df)
|
||||
|
||||
@app_x.command ("has")
|
||||
def registry_has (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")],
|
||||
folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")) :
|
||||
_pregistry = pix.Registry(folder=folder)
|
||||
if _pregistry.has(alias) :
|
||||
_msg = f"{CHECK_MARK} {alias} was [bold] found [/bold] in registry "
|
||||
else:
|
||||
_msg = f"{TIMES_MARK} {alias} was [bold] NOT found [/bold] in registry "
|
||||
print (_msg)
|
||||
|
||||
@app_x.command(name="test")
|
||||
def registry_test (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")],
|
||||
folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder")) :
|
||||
"""
|
||||
This function tests that a plugin, referenced as alias@file or file.function, can be loaded from the registry
|
||||
"""
|
||||
_pregistry = pix.Registry(folder=folder)
|
||||
# _item = transport.registry.plugins.has(key=key)
|
||||
_pointer = _pregistry.get(alias) if _pregistry.has(alias) else None
|
||||
|
||||
if _pointer:
|
||||
print (f"""{CHECK_MARK} successfully loaded [bold] {alias}[/bold] found in {folder}""")
|
||||
|
||||
else:
|
||||
print (f"{TIMES_MARK} unable to load {alias}. Make sure it is registered")
|
||||
app.add_typer(app_e,name='etl',help="This function will run etl or generate a template etl configuration file")
|
||||
app.add_typer(app_r,name='registry',help='This function allows labeling database access information')
|
||||
app.add_typer(app_i,name="info",help="This function will print either license or supported database technologies")
|
||||
app.add_typer(app_x, name="plugins",help="This function enables add/list/test of plugins in the registry")
|
||||
if __name__ == '__main__' :
|
||||
app()
|
||||
|
||||
|
@ -0,0 +1,23 @@
|
||||
__app_name__ = 'data-transport'
|
||||
__author__ = 'The Phi Technology'
|
||||
__version__= '2.2.22'
|
||||
__email__ = "info@the-phi.com"
|
||||
__edition__= 'community'
|
||||
__license__=f"""
|
||||
Copyright 2010 - 2024, Steve L. Nyemba
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
"""
|
||||
|
||||
__whatsnew__=f"""version {__version__},
|
||||
1. Added support for read/write logs as well as plugins (when applied)
|
||||
2. Bug fix for duckdb readers (now opened read-only) because of issues with threads & processes
|
||||
3. Support for streaming data, which matters when working with large volumes of data
|
||||
|
||||
|
||||
"""
|
@ -0,0 +1,148 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to Google Bigquery\n",
|
||||
"\n",
|
||||
"1. Insure you have a Google Bigquery service account key on disk\n",
|
||||
"2. The service key location is set as an environment variable **BQ_KEY**\n",
|
||||
"3. The dataset will be automatically created within the project associated with the service key\n",
|
||||
"\n",
|
||||
"The cell below creates a dataframe that will be stored within Google Bigquery"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 1/1 [00:00<00:00, 10106.76it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['data transport version ', '2.0.4']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to Google Bigquery database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n",
|
||||
"DATASET = 'demo'\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"bqw = transport.get.writer(provider=providers.BIGQUERY,dataset=DATASET,table='friends',private_key=PRIVATE_KEY)\n",
|
||||
"bqw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (['data transport version ', transport.__version__])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from Google Bigquery\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading: 100%|\u001b[32m██████████\u001b[0m|\n",
|
||||
"Downloading: 100%|\u001b[32m██████████\u001b[0m|\n",
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
" _counts f0_\n",
|
||||
"0 3 83.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import os\n",
|
||||
"PRIVATE_KEY=os.environ['BQ_KEY']\n",
|
||||
"pgr = transport.get.reader(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)\n",
|
||||
"_df = pgr.read()\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from demo.friends'\n",
|
||||
"_sdf = pgr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,188 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Extract Transform Load (ETL) from Code\n",
|
||||
"\n",
|
||||
"The example below reads data from an http source (github) and will copy the data to a csv file and to a database. This example illustrates the one-to-many ETL features.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>id</th>\n",
|
||||
" <th>location_id</th>\n",
|
||||
" <th>address_1</th>\n",
|
||||
" <th>address_2</th>\n",
|
||||
" <th>city</th>\n",
|
||||
" <th>state_province</th>\n",
|
||||
" <th>postal_code</th>\n",
|
||||
" <th>country</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>2600 Middlefield Road</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>Redwood City</td>\n",
|
||||
" <td>CA</td>\n",
|
||||
" <td>94063</td>\n",
|
||||
" <td>US</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>24 Second Avenue</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>San Mateo</td>\n",
|
||||
" <td>CA</td>\n",
|
||||
" <td>94401</td>\n",
|
||||
" <td>US</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>24 Second Avenue</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>San Mateo</td>\n",
|
||||
" <td>CA</td>\n",
|
||||
" <td>94403</td>\n",
|
||||
" <td>US</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>24 Second Avenue</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>San Mateo</td>\n",
|
||||
" <td>CA</td>\n",
|
||||
" <td>94401</td>\n",
|
||||
" <td>US</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>24 Second Avenue</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>San Mateo</td>\n",
|
||||
" <td>CA</td>\n",
|
||||
" <td>94401</td>\n",
|
||||
" <td>US</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" id location_id address_1 address_2 city \\\n",
|
||||
"0 1 1 2600 Middlefield Road NaN Redwood City \n",
|
||||
"1 2 2 24 Second Avenue NaN San Mateo \n",
|
||||
"2 3 3 24 Second Avenue NaN San Mateo \n",
|
||||
"3 4 4 24 Second Avenue NaN San Mateo \n",
|
||||
"4 5 5 24 Second Avenue NaN San Mateo \n",
|
||||
"\n",
|
||||
" state_province postal_code country \n",
|
||||
"0 CA 94063 US \n",
|
||||
"1 CA 94401 US \n",
|
||||
"2 CA 94403 US \n",
|
||||
"3 CA 94401 US \n",
|
||||
"4 CA 94401 US "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to Google Bigquery database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"#\n",
|
||||
"#\n",
|
||||
"source = {\"provider\": \"http\", \"url\": \"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv\"}\n",
|
||||
"target = [{\"provider\": \"files\", \"path\": \"addresses.csv\", \"delimiter\": \",\"}, {\"provider\": \"sqlite\", \"database\": \"sample.db3\", \"table\": \"addresses\"}]\n",
|
||||
"\n",
|
||||
"_handler = transport.get.etl (source=source,target=target)\n",
|
||||
"_data = _handler.read() #-- all etl begins with data being read\n",
|
||||
"_data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Extract Transform Load (ETL) from CLI\n",
|
||||
"\n",
|
||||
"The documentation for this is available at https://healthcareio.the-phi.com/data-transport \"Docs\" -> \"Terminal CLI\"\n",
|
||||
"\n",
|
||||
"The entire process is documented including how to generate an ETL configuration file."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,138 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to Apache Iceberg\n",
|
||||
"\n",
|
||||
"1. Insure you have a Google Bigquery service account key on disk\n",
|
||||
"2. The service key location is set as an environment variable **BQ_KEY**\n",
|
||||
"3. The dataset will be automatically created within the project associated with the service key\n",
|
||||
"\n",
|
||||
"The cell below creates a dataframe that will be stored within Google Bigquery"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['data transport version ', '2.4.0']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to Google Bigquery database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n",
|
||||
"DATASET = 'demo'\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"# bqw = transport.get.writer(provider=providers.ICEBERG,catalog='mz',database='edw.mz',table='friends')\n",
|
||||
"bqw = transport.get.writer(provider=providers.ICEBERG,table='edw.mz.friends')\n",
|
||||
"bqw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (['data transport version ', transport.__version__])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from Google Bigquery\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import os\n",
|
||||
"PRIVATE_KEY=os.environ['BQ_KEY']\n",
|
||||
"pgr = transport.get.reader(provider=providers.ICEBERG,database='edw.mz')\n",
|
||||
"_df = pgr.read(table='friends')\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
|
||||
"_sdf = pgr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"# print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,128 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to mongodb\n",
|
||||
"\n",
|
||||
"Insure mongodb is actually installed on the system, The cell below creates a dataframe that will be stored within mongodb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.0.4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to mongodb database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"mgw = transport.get.writer(provider=providers.MONGODB,db='demo',collection='friends')\n",
|
||||
"mgw.write(_data)\n",
|
||||
"print (transport.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from mongodb\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. The code in the background executes an aggregation using **db.runCommand**\n",
|
||||
"\n",
|
||||
"- Basic read of the designated collection **find=\\<collection>**\n",
|
||||
"- Executing an aggregate pipeline against a collection **aggreate=\\<collection>**\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
" _id _counts _mean\n",
|
||||
"0 0 2 102.5\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"mgr = transport.get.reader(provider=providers.MONGODB,db='foo',collection='friends')\n",
|
||||
"_df = mgr.read()\n",
|
||||
"PIPELINE = [{\"$group\":{\"_id\":0,\"_counts\":{\"$sum\":1}, \"_mean\":{\"$avg\":\"$age\"}}}]\n",
|
||||
"_sdf = mgr.read(aggregate='friends',pipeline=PIPELINE)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,150 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to Microsoft SQLServer\n",
|
||||
"\n",
|
||||
"1. Insure the Microsoft SQL Server is installed and you have access i.e account information\n",
|
||||
"2. The target database must be created before hand.\n",
|
||||
"3. We created an authentication file that will contain user account and location of the database\n",
|
||||
"\n",
|
||||
"The cell below creates a dataframe that will be stored in a Microsoft SQL Server database.\n",
|
||||
"\n",
|
||||
"**NOTE** This was not tested with a cloud instance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to Google Bigquery database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n",
|
||||
"MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n",
|
||||
"\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"msw = transport.get.writer(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n",
|
||||
"msw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (['data transport version ', transport.__version__])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from Microsoft SQL Server database\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within an MS SQL Server (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import os\n",
|
||||
"AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n",
|
||||
"MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n",
|
||||
"\n",
|
||||
"msr = transport.get.reader(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n",
|
||||
"_df = msr.read()\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
|
||||
"_sdf = msr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('\\n--------- STATISTICS ------------\\n')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'provider': 'sqlserver',\n",
|
||||
" 'dataset': 'demo',\n",
|
||||
" 'table': 'friends',\n",
|
||||
" 'username': '<username>',\n",
|
||||
" 'password': '<password>'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"provider\":\"sqlserver\",\n",
|
||||
" \"dataset\":\"demo\",\"table\":\"friends\",\"username\":\"<username>\",\"password\":\"<password>\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,161 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to MySQL\n",
|
||||
"\n",
|
||||
"1. Insure MySQL is actually installed on the system, \n",
|
||||
"2. There is a database called demo created on the said system\n",
|
||||
"\n",
|
||||
"The cell below creates a dataframe that will be stored within postgreSQL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.0.4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to PostgreSQL database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"myw = transport.get.writer(provider=providers.MYSQL,database='demo',table='friends',auth_file=\"/home/steve/auth-mysql.json\")\n",
|
||||
"myw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (transport.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from MySQL\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a MySQL (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
" _counts AVG(age)\n",
|
||||
"0 3 83.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"myr = transport.get.reader(provider=providers.MYSQL,database='demo',table='friends',auth_file='/home/steve/auth-mysql.json')\n",
|
||||
"_df = myr.read()\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
|
||||
"_sdf = myr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'host': 'klingon.io',\n",
|
||||
" 'port': 3306,\n",
|
||||
" 'username': 'me',\n",
|
||||
" 'password': 'foobar',\n",
|
||||
" 'provider': 'mysql',\n",
|
||||
" 'database': 'demo',\n",
|
||||
" 'table': 'friends'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"{\n",
|
||||
" \"host\":\"klingon.io\",\"port\":3306,\"username\":\"me\",\"password\":\"foobar\", \"provider\":\"mysql\",\n",
|
||||
" \"database\":\"demo\",\"table\":\"friends\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,149 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing data-transport plugins\n",
|
||||
"\n",
|
||||
"The data-transport plugins are designed to automate pre/post processing i.e\n",
|
||||
"\n",
|
||||
" - Read -> Post processing\n",
|
||||
" - Write-> Pre processing\n",
|
||||
" \n",
|
||||
"In this example we will assume, data and write both pre/post processing to any supported infrastructure. We will equally show how to specify the plugins within a configuration file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to Google Bigquery database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"import shutil\n",
|
||||
"#\n",
|
||||
"#\n",
|
||||
"\n",
|
||||
"DATABASE = '/home/steve/tmp/demo.db3'\n",
|
||||
"if os.path.exists(DATABASE) :\n",
|
||||
" os.remove(DATABASE)\n",
|
||||
"#\n",
|
||||
"# \n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"litew = transport.get.writer(provider=providers.SQLITE,database=DATABASE)\n",
|
||||
"litew.write(_data,table='friends')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from SQLite\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age from a plugin function we will write. \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Read with pipeline functions defined in code\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"It is possible to use **transport.factory.instance** or **transport.instance** or **transport.get.<[reader|writer]>** they are the same. It allows the maintainers to know that we used a factory design pattern."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" name age autoinc\n",
|
||||
"0 James Bond 5.5 0\n",
|
||||
"1 Steve Rogers 15.0 1\n",
|
||||
"2 Steve Nyemba 4.4 2\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"def _autoincrement (_data,**kwargs) :\n",
|
||||
" \"\"\"\n",
|
||||
" This function will add an autoincrement field to the table\n",
|
||||
" \"\"\"\n",
|
||||
" _data['autoinc'] = np.arange(_data.shape[0])\n",
|
||||
" \n",
|
||||
" return _data\n",
|
||||
"def reduce(_data,**_args) :\n",
|
||||
" \"\"\"\n",
|
||||
" This function will reduce the age of the data frame\n",
|
||||
" \"\"\"\n",
|
||||
" _data.age /= 10\n",
|
||||
" return _data\n",
|
||||
"reader = transport.get.reader(provider=providers.SQLITE,database=DATABASE,table='friends')\n",
|
||||
"#\n",
|
||||
"# basic read of the data created in the first cell\n",
|
||||
"_df = reader.read()\n",
|
||||
"print (_df)\n",
|
||||
"print ()\n",
|
||||
"print()\n",
|
||||
"#\n",
|
||||
"# read of the data with pipeline function provided to alter the database\n",
|
||||
"print (reader.read(pipeline=[_autoincrement,reduce]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The parameters for instianciating a transport object (reader or writer) can be found at [data-transport home](https://healthcareio.the-phi.com/data-transport)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,162 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to PostgreSQL\n",
|
||||
"\n",
|
||||
"1. Insure PostgreSQL is actually installed on the system, \n",
|
||||
"2. There is a database called demo created on the said system\n",
|
||||
"\n",
|
||||
"The cell below creates a dataframe that will be stored within postgreSQL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.0.4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to PostgreSQL database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"pgw = transport.get.writer(provider=providers.POSTGRESQL,database='demo',table='friends')\n",
|
||||
"pgw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (transport.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from PostgreSQL\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
" _counts avg\n",
|
||||
"0 3 83.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"pgr = transport.get.reader(provider=providers.POSTGRESQL,database='demo',table='friends')\n",
|
||||
"_df = pgr.read()\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
|
||||
"_sdf = pgr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'host': 'klingon.io',\n",
|
||||
" 'port': 5432,\n",
|
||||
" 'username': 'me',\n",
|
||||
" 'password': 'foobar',\n",
|
||||
" 'provider': 'postgresql',\n",
|
||||
" 'database': 'demo',\n",
|
||||
" 'table': 'friends'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"{\n",
|
||||
" \"host\":\"klingon.io\",\"port\":5432,\"username\":\"me\",\"password\":\"foobar\", \"provider\":\"postgresql\",\n",
|
||||
" \"database\":\"demo\",\"table\":\"friends\"\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,131 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to AWS S3\n",
|
||||
"\n",
|
||||
"We have setup our demo environment with the label **aws** passed to reference our s3 access_key and secret_key and file (called friends.csv). In the cell below we will write the data to our aws s3 bucket named **com.phi.demo**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.2.1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to mongodb database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"mgw = transport.get.writer(label='aws')\n",
|
||||
"mgw.write(_data)\n",
|
||||
"print (transport.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from AWS S3\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. The code in the background executes an aggregation using\n",
|
||||
"\n",
|
||||
"- Basic read of the designated file **friends.csv**\n",
|
||||
"- Compute average age using standard pandas functions\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" bname age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
"83.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"def cast(stream) :\n",
|
||||
" print (stream)\n",
|
||||
" return pd.DataFrame(str(stream))\n",
|
||||
"mgr = transport.get.reader(label='aws')\n",
|
||||
"_df = mgr.read()\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_df.age.mean())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
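{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Below is a minimal **auth-file** sketch for the **aws** label used above. The field names mirror the s3 reader/writer arguments in this project; the **provider** value and the credentials are placeholders for illustration:\n",
  "\n",
  "```json\n",
  "{\n",
  "    \"provider\":\"s3\",\n",
  "    \"access_key\":\"<aws-access-key>\",\n",
  "    \"secret_key\":\"<aws-secret-key>\",\n",
  "    \"region\":\"us-east-1\",\n",
  "    \"bucket\":\"com.phi.demo\",\n",
  "    \"file\":\"friends.csv\"\n",
  "}\n",
  "```"
 ]
},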
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,143 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Writing to SQLite3+\n",
|
||||
"\n",
|
||||
"The requirements to get started are minimal (actually none). The cell below creates a dataframe that will be stored within SQLite 3+"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2.0.4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#\n",
|
||||
"# Writing to PostgreSQL database\n",
|
||||
"#\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"import pandas as pd\n",
|
||||
"_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n",
|
||||
"sqw = transport.get.writer(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n",
|
||||
"sqw.write(_data,if_exists='replace') #-- default is append\n",
|
||||
"print (transport.__version__)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Reading from SQLite3+\n",
|
||||
"\n",
|
||||
"The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). \n",
|
||||
"\n",
|
||||
"- Basic read of the designated table (friends) created above\n",
|
||||
"- Execute an aggregate SQL against the table\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"**NOTE**\n",
|
||||
"\n",
|
||||
"By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n",
|
||||
"Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" name age\n",
|
||||
"0 James Bond 55\n",
|
||||
"1 Steve Rogers 150\n",
|
||||
"2 Steve Nyemba 44\n",
|
||||
"--------- STATISTICS ------------\n",
|
||||
" _counts AVG(age)\n",
|
||||
"0 3 83.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"import transport\n",
|
||||
"from transport import providers\n",
|
||||
"sqr = transport.get.reader(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n",
|
||||
"_df = sqr.read()\n",
|
||||
"_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n",
|
||||
"_sdf = sqr.read(sql=_query)\n",
|
||||
"print (_df)\n",
|
||||
"print ('--------- STATISTICS ------------')\n",
|
||||
"print (_sdf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"An **auth-file** is a file that contains database parameters used to access the database. \n",
|
||||
"For code in shared environments, we recommend \n",
|
||||
"\n",
|
||||
"1. Having the **auth-file** stored on disk \n",
|
||||
"2. and the location of the file is set to an environment variable.\n",
|
||||
"\n",
|
||||
"To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"{\n",
|
||||
" \"provider\":\"sqlite\",\n",
|
||||
" \"database\":\"/home/steve/demo.db3\",\"table\":\"friends\"\n",
|
||||
"}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "data-transport"
|
||||
dynamic = ["version"]
|
||||
authors = [
|
||||
{name="Steve L. Nyemba" , email = "info@the-phi.com"},
|
||||
]
|
||||
description = ""
|
||||
readme = "README.md"
|
||||
license = {text = "LICENSE"}
|
||||
keywords = ["mongodb","duckdb","couchdb","rabbitmq","file","read","write","s3","sqlite"]
|
||||
classifiers = [
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Topic :: Utilities",
|
||||
]
|
||||
dependencies = [
|
||||
"termcolor","sqlalchemy", "aiosqlite","duckdb-engine",
|
||||
"mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite",
|
||||
"typer","pandas","numpy","sqlalchemy","pyarrow","smart-open",
|
||||
"plugin-ix@git+https://github.com/lnyemba/plugins-ix"
|
||||
]
|
||||
[project.optional-dependencies]
|
||||
#sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
|
||||
nosql = ["pymongo","cloudant"]
|
||||
cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"]
|
||||
warehouse = ["pydrill","pyspark","sqlalchemy_drill"]
|
||||
other = ["pika","flask-session"]
|
||||
all = ["pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://healthcareio.the-phi.com/git/code/transport.git"
|
||||
|
||||
#[project.scripts]
|
||||
#transport = "transport:main"
|
||||
|
||||
[tool.setuptools]
|
||||
include-package-data = true
|
||||
zip-safe = false
|
||||
script-files = ["bin/transport"]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["info","info.*", "transport", "transport.*"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "info.__version__"}
|
||||
#authors = {attr = "meta.__author__"}
|
||||
|
||||
# If you have a info.py file, you might also want to include the author dynamically:
|
||||
# [tool.setuptools.dynamic]
|
||||
# version = {attr = "info.__version__"}
|
||||
# authors = {attr = "info.__author__"}
|
@ -0,0 +1,35 @@
|
||||
asn1crypto==0.23.0
|
||||
boto==2.48.0
|
||||
boto3==1.4.7
|
||||
botocore==1.7.17
|
||||
bz2file==0.98
|
||||
certifi==2017.7.27.1
|
||||
cffi==1.11.0
|
||||
chardet==3.0.4
|
||||
click==6.7
|
||||
couchdbkit==0.6.5
|
||||
cryptography==2.0.3
|
||||
docutils==0.14
|
||||
enum34==1.1.6
|
||||
Flask==0.12.2
|
||||
futures==3.1.1
|
||||
http-parser==0.8.3
|
||||
idna==2.6
|
||||
ipaddress==1.0.18
|
||||
itsdangerous==0.24
|
||||
Jinja2==2.9.6
|
||||
jmespath==0.9.3
|
||||
MarkupSafe==1.0
|
||||
numpy==1.13.1
|
||||
pika==0.11.0
|
||||
pycparser==2.18
|
||||
pyOpenSSL==17.3.0
|
||||
python-dateutil==2.6.1
|
||||
requests==2.18.4
|
||||
restkit==4.2.2
|
||||
s3transfer==0.1.11
|
||||
six==1.11.0
|
||||
smart-open==1.5.3
|
||||
socketpool==0.5.3
|
||||
urllib3==1.22
|
||||
Werkzeug==0.12.2
|
@ -0,0 +1,235 @@
|
||||
"""
|
||||
Data Transport, The Phi Technology LLC
|
||||
Steve L. Nyemba, steve@the-phi.com
|
||||
|
||||
This library is designed to serve as a wrapper to a set of supported data stores :
|
||||
- couchdb
|
||||
- mongodb
|
||||
- Files (character delimited)
|
||||
- Queues (RabbitMQ)
|
||||
- Session (Flask)
|
||||
- s3
|
||||
- sqlite
|
||||
The supported operations are read/write and providing meta data to the calling code
|
||||
We separated reads from writes to mitigate accidents associated with writes.
|
||||
Source Code is available under MIT License:
|
||||
https://healthcareio.the-phi.com/data-transport
|
||||
https://hiplab.mc.vanderbilt.edu/git/hiplab/data-transport
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
#from transport import sql, nosql, cloud, other, warehouse
|
||||
from transport import sql
|
||||
try:
|
||||
from transport import nosql
|
||||
except Exception as e:
|
||||
nosql = {}
|
||||
try:
|
||||
from transport import cloud
|
||||
except Exception as e:
|
||||
cloud = {}
|
||||
try:
|
||||
from transport import warehouse
|
||||
except Exception as e:
|
||||
warehouse = {}
|
||||
try:
|
||||
from transport import other
|
||||
except Exception as e:
|
||||
other = {}
|
||||
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
from info import __version__,__author__,__email__,__license__,__app_name__,__whatsnew__,__edition__
|
||||
from transport.iowrapper import IWriter, IReader, IETL
|
||||
from transport.plugins import PluginLoader
|
||||
from transport import providers
|
||||
import copy
|
||||
from transport import registry
|
||||
from transport.plugins import Plugin
|
||||
PROVIDERS = {}
|
||||
|
||||
def init():
|
||||
global PROVIDERS
|
||||
for _module in [cloud,sql,nosql,other,warehouse] :
|
||||
for _provider_name in dir(_module) :
|
||||
if _provider_name.startswith('__') or _provider_name == 'common' or type(_module) in [None,str,dict]:
|
||||
continue
|
||||
PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__}
|
||||
#
|
||||
# loading the registry
|
||||
if not registry.isloaded() :
|
||||
registry.load()
|
||||
|
||||
# def _getauthfile (path) :
|
||||
# f = open(path)
|
||||
# _object = json.loads(f.read())
|
||||
# f.close()
|
||||
# return _object
|
||||
def instance (**_args):
|
||||
"""
|
||||
This function returns an object to read from or write to a supported data store (provider/vendor)
|
||||
@provider provider
|
||||
@context read/write (default is read)
|
||||
@auth_file optional path to a file that holds the database parameters; useful to avoid sharing credentials in code
|
||||
kwargs These are arguments that are provider/vendor specific
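
e.g. a minimal sketch (the database path and table name below are placeholders):
    _reader = transport.instance(provider=providers.SQLITE, database='/home/steve/demo.db3', table='friends', context='read')
    _df = _reader.read()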
|
||||
"""
|
||||
global PROVIDERS
|
||||
|
||||
if 'auth_file' in _args:
|
||||
if os.path.exists(_args['auth_file']) :
|
||||
#
|
||||
# @TODO: add encryption module and decryption to enable this to be secure
|
||||
#
|
||||
|
||||
f = open(_args['auth_file'])
|
||||
#_args = dict (_args,** json.loads(f.read()) )
|
||||
#
|
||||
# we override the file parameters with the arguments passed
|
||||
_args = dict (json.loads(f.read()),**_args )
|
||||
f.close()
|
||||
else:
|
||||
filename = _args['auth_file']
|
||||
raise Exception(f" {filename} was not found or is invalid")
|
||||
if 'provider' not in _args and 'auth_file' not in _args :
|
||||
if not registry.isloaded () :
|
||||
if ('path' in _args and registry.exists(_args['path'] )) or registry.exists():
|
||||
registry.load() if 'path' not in _args else registry.load(_args['path'])
|
||||
_info = {}
|
||||
if 'label' in _args and registry.isloaded():
|
||||
_info = registry.get(_args['label'])
|
||||
else:
|
||||
_info = registry.get()
|
||||
if _info :
|
||||
_args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments
|
||||
|
||||
if 'provider' in _args and _args['provider'] in PROVIDERS :
|
||||
_info = PROVIDERS[_args['provider']]
|
||||
_module = _info['module']
|
||||
if 'context' in _args :
|
||||
_context = _args['context']
|
||||
else:
|
||||
_context = 'read'
|
||||
_pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer')
|
||||
_agent = _pointer (**_args)
|
||||
#
|
||||
loader = None
|
||||
|
||||
#
|
||||
# @TODO:
|
||||
# define a logger object here that will used by the wrapper
|
||||
# this would allow us to know what the data-transport is doing and where/how it fails
|
||||
#
|
||||
|
||||
# if 'plugins' in _args :
|
||||
# _params = _args['plugins']
|
||||
|
||||
# if 'path' in _params and 'names' in _params :
|
||||
# loader = PluginLoader(**_params)
|
||||
# elif type(_params) == list:
|
||||
# loader = PluginLoader()
|
||||
# for _delegate in _params :
|
||||
# loader.set(_delegate)
|
||||
|
||||
_plugins = None if 'plugins' not in _args else _args['plugins']
|
||||
|
||||
# if registry.has('logger') :
|
||||
# _kwa = registry.get('logger')
|
||||
# _lmodule = getPROVIDERS[_kwa['provider']]
|
||||
|
||||
if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')):
|
||||
#
|
||||
# The requested label is not 'logger', so we set up a logger if one is specified in the registry
|
||||
#
|
||||
_kwargs = registry.get('logger')
|
||||
_kwargs['context'] = 'write'
|
||||
_kwargs['table'] =_module.__name__.split('.')[-1]+'_logs'
|
||||
# _logger = instance(**_kwargs)
|
||||
_module = PROVIDERS[_kwargs['provider']]['module']
|
||||
_logger = getattr(_module,'Writer')
|
||||
_logger = _logger(**_kwargs)
|
||||
else:
|
||||
_logger = None
|
||||
|
||||
_kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger}
|
||||
if 'args' in _args :
|
||||
_kwargs['args'] = _args['args']
|
||||
# _datatransport = IReader(_agent,_plugins,_logger) if _context == 'read' else IWriter(_agent,_plugins,_logger)
|
||||
_datatransport = IReader(**_kwargs) if _context == 'read' else IWriter(**_kwargs)
|
||||
return _datatransport
|
||||
|
||||
else:
|
||||
#
|
||||
# We can handle the case for an ETL object
|
||||
#
|
||||
raise Exception ("Missing or Unknown provider")
|
||||
pass
|
||||
class get :
|
||||
"""
|
||||
This class is just a wrapper to make the interface (API) more conversational and easy to understand
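
e.g. minimal sketches (the label and connection details below are placeholders):
    _reader = transport.get.reader(label='aws')
    _writer = transport.get.writer(provider=providers.SQLITE, database='/home/steve/demo.db3', table='friends')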
|
||||
"""
|
||||
@staticmethod
|
||||
def reader (**_args):
|
||||
if not _args or ('provider' not in _args and 'label' not in _args):
|
||||
_args['label'] = 'default'
|
||||
_args['context'] = 'read'
|
||||
# return instance(**_args)
|
||||
# _args['logger'] = instance(**{'label':'logger','context':'write','table':'logs'})
|
||||
|
||||
_handler = instance(**_args)
|
||||
# _handler.setLogger(get.logger())
|
||||
return _handler
|
||||
|
||||
|
||||
@staticmethod
|
||||
def writer(**_args):
|
||||
"""
|
||||
This function is a wrapper that will return a writer to a database. It disambiguates the interface
|
||||
"""
|
||||
if not _args or ('provider' not in _args and 'label' not in _args):
|
||||
_args['label'] = 'default'
|
||||
_args['context'] = 'write'
|
||||
# _args['logger'] = instance(**{'label':'logger','context':'write','table':'logs'})
|
||||
|
||||
_handler = instance(**_args)
|
||||
#
|
||||
# Implementing logging with the 'eat-your-own-dog-food' approach
|
||||
# Using dependency injection to set the logger (problem with imports)
|
||||
#
|
||||
# _handler.setLogger(get.logger())
|
||||
return _handler
|
||||
@staticmethod
|
||||
def logger ():
|
||||
if registry.has('logger') :
|
||||
_args = registry.get('logger')
|
||||
_args['context'] = 'write'
|
||||
return instance(**_args)
|
||||
return None
|
||||
@staticmethod
|
||||
def etl (**_args):
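"""
This function returns an ETL object that reads from 'source' and writes to every entry in 'target'.
A minimal sketch (the provider, database and table values below are placeholders):
    _etl = transport.get.etl(source={'provider':'sqlite','database':'/home/steve/demo.db3','table':'friends'},
                             target=[{'provider':'sqlite','database':'/home/steve/copy.db3','table':'friends'}])
    _etl.run()
"""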
|
||||
if 'source' in _args and 'target' in _args :
|
||||
|
||||
return IETL(**_args)
|
||||
else:
|
||||
raise Exception ("Malformed input found, object must have both 'source' and 'target' attributes")
|
||||
|
||||
def supported ():
|
||||
_info = {}
|
||||
for _provider in PROVIDERS :
|
||||
_item = PROVIDERS[_provider]
|
||||
if _item['type'] not in _info :
|
||||
_info[_item['type']] = []
|
||||
_info[_item['type']].append(_provider)
|
||||
_df = pd.DataFrame()
|
||||
for _id in _info :
|
||||
if not _df.shape[0] :
|
||||
_df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')])
|
||||
else:
|
||||
_df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')]).join(_df, how='outer')
|
||||
return _df.fillna('')
|
||||
class factory :
|
||||
pass
|
||||
factory.instance = instance
|
||||
init()
|
@ -0,0 +1,6 @@
|
||||
"""
|
||||
Steve L. Nyemba, nyemba@gmail.com
|
||||
This namespace implements support for cloud databases databricks,bigquery ...
|
||||
"""
|
||||
from . import bigquery, databricks, nextcloud, s3
|
||||
|
@ -0,0 +1,159 @@
|
||||
"""
|
||||
Implementing support for google's bigquery
|
||||
- cloud.bigquery.Read
|
||||
- cloud.bigquery.Write
|
||||
"""
|
||||
import json
|
||||
from google.oauth2 import service_account
|
||||
from google.cloud import bigquery as bq
|
||||
|
||||
from multiprocessing import Lock, RLock
|
||||
import pandas as pd
|
||||
import pandas_gbq as pd_gbq
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
MAX_CHUNK = 2000000
|
||||
class BigQuery:
|
||||
def __init__(self,**_args):
|
||||
path = _args['service_key'] if 'service_key' in _args else _args['private_key']
|
||||
self.credentials = service_account.Credentials.from_service_account_file(path)
|
||||
self.dataset = _args['dataset'] if 'dataset' in _args else None
|
||||
self.path = path
|
||||
self.dtypes = _args['dtypes'] if 'dtypes' in _args else None
|
||||
self.table = _args['table'] if 'table' in _args else None
|
||||
self.client = bq.Client.from_service_account_json(self.path)
|
||||
def meta(self,**_args):
|
||||
"""
|
||||
This function returns meta data for a given table or query with dataset/table properly formatted
|
||||
:param table name of the table WITHOUT the dataset prefix
|
||||
:param sql sql query to be executed
|
||||
"""
|
||||
table = _args['table'] if 'table' in _args else self.table
|
||||
|
||||
try:
|
||||
if table :
|
||||
_dataset = self.dataset if 'dataset' not in _args else _args['dataset']
|
||||
sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """
|
||||
_info = {'credentials':self.credentials,'dialect':'standard'}
|
||||
return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records')
|
||||
# return self.read(sql=sql).to_dict(orient='records')
|
||||
# ref = self.client.dataset(self.dataset).table(table)
|
||||
|
||||
# _schema = self.client.get_table(ref).schema
|
||||
# return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema]
|
||||
else :
|
||||
return []
|
||||
except Exception as e:
|
||||
|
||||
return []
|
||||
def has(self,**_args):
|
||||
found = False
|
||||
try:
|
||||
_has = self.meta(**_args)
|
||||
found = _has is not None and len(_has) > 0
|
||||
except Exception as e:
|
||||
pass
|
||||
return found
|
||||
class Reader (BigQuery):
|
||||
"""
|
||||
Implementing support for reading from bigquery. This class acts as a wrapper around Google's API.
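
e.g. a minimal sketch (the service key path, dataset and table names below are placeholders):
    _bqr = Reader(service_key='/path/to/service-key.json', dataset='demo', table='friends')
    _df = _bqr.read(sql='SELECT * FROM :dataset.friends LIMIT 10')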
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
|
||||
super().__init__(**_args)
|
||||
def apply(self,sql):
|
||||
return self.read(sql=sql)
|
||||
|
||||
def read(self,**_args):
|
||||
SQL = None
|
||||
table = self.table if 'table' not in _args else _args['table']
|
||||
if 'sql' in _args :
|
||||
SQL = _args['sql']
|
||||
elif table:
|
||||
|
||||
table = "".join(["`",table,"`"]) if '.' in table else "".join(["`:dataset.",table,"`"])
|
||||
SQL = "SELECT * FROM :table ".replace(":table",table)
|
||||
if not SQL :
|
||||
return None
|
||||
if SQL and 'limit' in _args:
|
||||
SQL += " LIMIT "+str(_args['limit'])
|
||||
if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset:
|
||||
SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset)
|
||||
_info = {'credentials':self.credentials,'dialect':'standard'}
|
||||
return pd_gbq.read_gbq(SQL,**_info) if SQL else None
|
||||
# return self.client.query(SQL).to_dataframe() if SQL else None
|
||||
|
||||
class Writer (BigQuery):
|
||||
"""
|
||||
This class implements support for writing against bigquery
|
||||
"""
|
||||
lock = RLock()
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
self.parallel = False if 'lock' not in _args else _args['lock']
|
||||
self.table = _args['table'] if 'table' in _args else None
|
||||
self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials}
|
||||
self._chunks = 1 if 'chunks' not in _args else int(_args['chunks'])
|
||||
self._location = 'US' if 'location' not in _args else _args['location']
|
||||
def write(self,_data,**_args) :
|
||||
"""
|
||||
This function will perform a write to bigquery
|
||||
:_data data-frame to be written to bigquery
|
||||
"""
|
||||
try:
|
||||
if self.parallel or 'lock' in _args :
|
||||
Writer.lock.acquire()
|
||||
_args['table'] = self.table if 'table' not in _args else _args['table']
|
||||
self._write(_data,**_args)
|
||||
finally:
|
||||
if self.parallel or 'lock' in _args :
|
||||
Writer.lock.release()
|
||||
def submit(self,_sql):
|
||||
"""
|
||||
Write the output of a massive query to a given table; bigquery will handle this as a job.
|
||||
This function will return the job identifier
|
||||
"""
|
||||
_config = bq.QueryJobConfig()
|
||||
_config.destination = self.client.dataset(self.dataset).table(self.table)
|
||||
_config.allow_large_results = True
|
||||
# _config.write_disposition = bq.bq_consts.WRITE_APPEND
|
||||
_config.dry_run = False
|
||||
# _config.priority = 'BATCH'
|
||||
_resp = self.client.query(_sql,location=self._location,job_config=_config)
|
||||
return _resp.job_id
|
||||
def status (self,_id):
|
||||
return self.client.get_job(_id,location=self._location)
|
||||
def _write(self,_info,**_args) :
|
||||
_df = None
|
||||
if type(_info) in [list,pd.DataFrame] :
|
||||
if type(_info) == list :
|
||||
_df = pd.DataFrame(_info)
|
||||
elif type(_info) == pd.DataFrame :
|
||||
_df = _info
|
||||
|
||||
if '.' not in _args['table'] :
|
||||
self.mode['destination_table'] = '.'.join([self.dataset,_args['table']])
|
||||
else:
|
||||
|
||||
self.mode['destination_table'] = _args['table'].strip()
|
||||
if 'schema' in _args :
|
||||
self.mode['table_schema'] = _args['schema']
|
||||
#
|
||||
# Let us ensure that the types are somewhat compatible ...
|
||||
# _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str}
|
||||
# _mode = copy.deepcopy(self.mode)
|
||||
# _mode = self.mode
|
||||
# _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
||||
#
|
||||
# Let us adjust the chunking here
|
||||
if 'if_exists' in _args :
|
||||
self.mode['if_exists'] = _args['if_exists']
|
||||
self._chunks = 10 if _df.shape[0] > MAX_CHUNK and self._chunks == 1 else self._chunks
|
||||
_indexes = np.array_split(np.arange(_df.shape[0]),self._chunks)
|
||||
for i in _indexes :
|
||||
# _df.iloc[i].to_gbq(**self.mode)
|
||||
pd_gbq.to_gbq(_df.iloc[i],**self.mode)
|
||||
time.sleep(1)
|
||||
pass
|
@ -0,0 +1,111 @@
|
||||
"""
|
||||
This file implements databricks handling; this functionality relies on databricks-sql-connector
|
||||
LICENSE (MIT)
|
||||
Copyright 2016-2020, The Phi Technology LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
@TODO:
|
||||
- Migrate SQLite to SQL hierarchy
|
||||
- Include Write in Chunks from pandas
|
||||
"""
|
||||
import os
|
||||
import sqlalchemy
|
||||
# from transport.common import Reader,Writer
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class Bricks:
|
||||
"""
|
||||
:host
|
||||
:token
|
||||
:database
|
||||
:cluster_path
|
||||
:table
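:catalog
:schema (optional, overrides :database when provided)

e.g. a minimal sketch (the host, token, cluster_path and catalog values below are placeholders):
    _dbx = Reader(host='adb-xxxx.azuredatabricks.net', token='dapi-xxxx',
                  cluster_path='/sql/1.0/warehouses/xxxx', catalog='hive_metastore',
                  schema='default', table='friends')
    _df = _dbx.read(table='friends', limit=10)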
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
_host = _args['host']
|
||||
_token= _args['token']
|
||||
_cluster_path = _args['cluster_path']
|
||||
self._schema = _args['schema'] if 'schema' in _args else _args['database']
|
||||
_catalog = _args['catalog']
|
||||
self._table = _args['table'] if 'table' in _args else None
|
||||
|
||||
#
|
||||
# @TODO:
|
||||
# Sometimes when the cluster isn't up and running it takes a while to start; the user should be alerted of this
|
||||
#
|
||||
|
||||
_uri = f'''databricks+connector://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}'''
|
||||
self._engine = sqlalchemy.create_engine (_uri)
|
||||
pass
|
||||
def meta(self,**_args):
|
||||
table = _args['table'] if 'table' in _args else self._table
|
||||
if not table :
|
||||
return []
|
||||
else:
|
||||
if sqlalchemy.__version__.startswith('1.') :
|
||||
_m = sqlalchemy.MetaData(bind=self._engine)
|
||||
_m.reflect(only=[table])
|
||||
else:
|
||||
_m = sqlalchemy.MetaData()
|
||||
_m.reflect(bind=self._engine)
|
||||
#
|
||||
# Let's retrieve the information associated with a table
|
||||
#
|
||||
return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns]
|
||||
|
||||
def has(self,**_args):
|
||||
return self.meta(**_args)
|
||||
def apply(self,_sql):
|
||||
try:
|
||||
if _sql.lower().startswith('select') :
|
||||
return pd.read_sql(_sql,self._engine)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
class Reader(Bricks):
|
||||
"""
|
||||
This class is designed for reads and will execute reads against a table name or a select SQL statement
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
def read(self,**_args):
|
||||
limit = None if 'limit' not in _args else str(_args['limit'])
|
||||
|
||||
if 'sql' in _args :
|
||||
sql = _args['sql']
|
||||
elif 'table' in _args :
|
||||
table = _args['table']
|
||||
sql = f'SELECT * FROM {table}'
|
||||
if limit :
|
||||
sql = sql + f' LIMIT {limit}'
|
||||
|
||||
if 'sql' in _args or 'table' in _args :
|
||||
return self.apply(sql)
|
||||
else:
|
||||
return pd.DataFrame()
|
||||
pass
|
||||
class Writer(Bricks):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
def write(self,_data,**_args):
|
||||
"""
|
||||
This function will write data to databricks against a given table. If the table was not specified upon initialization, it can be specified here
|
||||
_data: data frame to push to databricks
|
||||
_args: chunks, table, schema
|
||||
"""
|
||||
_schema = self._schema if 'schema' not in _args else _args['schema']
|
||||
_table = self._table if 'table' not in _args else _args['table']
|
||||
_df = _data if type(_data) == pd.DataFrame else _data
|
||||
if type(_df) == dict :
|
||||
_df = [_df]
|
||||
if type(_df) == list :
|
||||
_df = pd.DataFrame(_df)
|
||||
_df.to_sql(
|
||||
name=_table,schema=_schema,
|
||||
con=self._engine,if_exists='append',index=False);
|
||||
pass
|
@ -0,0 +1,80 @@
|
||||
"""
|
||||
We are implementing transport to and from nextcloud (just like s3)
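
A minimal usage sketch (the url, uid, token, folder and file values below are placeholders):
    _ncr = Reader(url='https://cloud.example.com', uid='demo', token='<app-token>', folder='/data', file='friends.csv')
    _df = _ncr.read()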
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
from transport.common import IEncoder
|
||||
import pandas as pd
|
||||
from io import StringIO
|
||||
import json
|
||||
import nextcloud_client as nextcloud
|
||||
|
||||
class Nextcloud :
|
||||
def __init__(self,**_args):
|
||||
pass
|
||||
self._delimiter = None
|
||||
self._handler = nextcloud.Client(_args['url'])
|
||||
_uid = _args['uid']
|
||||
_token = _args['token']
|
||||
self._uri = _args['folder'] if 'folder' in _args else './'
|
||||
if self._uri.endswith('/') :
|
||||
self._uri = self._uri[:-1]
|
||||
self._file = None if 'file' not in _args else _args['file']
|
||||
self._handler.login(_uid,_token)
|
||||
def close(self):
|
||||
try:
|
||||
self._handler.logout()
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
|
||||
class Reader(Nextcloud):
|
||||
def __init__(self,**_args):
|
||||
# self._file = [] if 'file' not in _args else _args['file']
|
||||
super().__init__(**_args)
|
||||
pass
|
||||
def read(self,**_args):
|
||||
_filename = self._file if 'file' not in _args else _args['file']
|
||||
#
|
||||
# @TODO: if _filename is none, an exception should be raised
|
||||
#
|
||||
_uri = '/'.join([self._uri,_filename])
|
||||
if self._handler.get_file(_uri) :
|
||||
#
|
||||
#
|
||||
_info = self._handler.file_info(_uri)
|
||||
_content = self._handler.get_file_contents(_uri).decode('utf8')
|
||||
if _info.get_content_type() == 'text/csv' :
|
||||
#
|
||||
# @TODO: enable handling of csv, xls, parquet, pickles
|
||||
_file = StringIO(_content)
|
||||
return pd.read_csv(_file)
|
||||
else:
|
||||
#
|
||||
# if it is not a structured document like csv, we return the content as is
|
||||
return _content
|
||||
return None
|
||||
class Writer (Nextcloud):
|
||||
"""
|
||||
This class will write data to an instance of nextcloud
|
||||
"""
|
||||
def __init__(self,**_args) :
|
||||
super().__init__(**_args)
|
||||
|
||||
def write(self,_data,**_args):
|
||||
"""
|
||||
This function will upload a file to a given destination
|
||||
:file has the uri of the location of the file
|
||||
"""
|
||||
_filename = self._file if 'file' not in _args else _args['file']
|
||||
_uri = '/'.join([self._uri,_filename])
|
||||
if type(_data) == pd.DataFrame :
|
||||
f = StringIO()
|
||||
_data.to_csv(f,index=False)
|
||||
_content = f.getvalue()
|
||||
elif type(_data) == dict :
|
||||
_content = json.dumps(_data,cls=IEncoder)
|
||||
else:
|
||||
_content = str(_data)
|
||||
self._handler.put_file_contents(_uri,_content)
|
||||
|
@ -0,0 +1,137 @@
|
||||
"""
|
||||
Data Transport - 1.0
|
||||
Steve L. Nyemba, The Phi Technology LLC
|
||||
|
||||
This file is a wrapper around s3 bucket provided by AWS for reading and writing content
|
||||
TODO:
|
||||
- Address limitations so that csv content is properly read when it is stored with content type text/csv
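
A minimal usage sketch (the credentials, region, bucket and file below are placeholders):
    _s3r = Reader(access_key='<aws-access-key>', secret_key='<aws-secret-key>', region='us-east-1', bucket='com.phi.demo', file='friends.csv')
    _df = _s3r.read()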
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
import boto3
|
||||
# from boto.s3.connection import S3Connection, OrdinaryCallingFormat
|
||||
import numpy as np
|
||||
import botocore
|
||||
from smart_open import smart_open
|
||||
import sys
|
||||
|
||||
import json
|
||||
from io import StringIO
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
class s3 :
|
||||
"""
|
||||
@TODO: Implement a search function for a file given a bucket??
|
||||
"""
|
||||
def __init__(self,**args) :
|
||||
"""
|
||||
This function will extract a file or set of files from the s3 bucket provided
|
||||
@param access_key
|
||||
@param secret_key
|
||||
@param path location of the file
|
||||
@param filter filename or filtering elements
|
||||
"""
|
||||
try:
|
||||
self._client = boto3.client('s3',aws_access_key_id=args['access_key'],aws_secret_access_key=args['secret_key'],region_name=args['region'])
|
||||
self._bucket_name = args['bucket']
|
||||
self._file_name = args['file']
|
||||
self._region = args['region']
|
||||
except Exception as e :
|
||||
print (e)
|
||||
pass
|
||||
def has(self,**_args):
|
||||
_found = None
|
||||
try:
|
||||
if 'file' in _args and 'bucket' in _args:
|
||||
_found = self.meta(**_args)
|
||||
elif 'bucket' in _args and not 'file' in _args:
|
||||
_found = self._client.list_objects(Bucket=_args['bucket'])
|
||||
elif 'file' in _args and not 'bucket' in _args :
|
||||
_found = self.meta(bucket=self._bucket_name,file = _args['file'])
|
||||
except Exception as e:
|
||||
_found = None
|
||||
pass
|
||||
return type(_found) == dict
|
||||
def meta(self,**args):
|
||||
"""
|
||||
This function will return information either about the file in a given bucket
|
||||
:name name of the bucket
|
||||
"""
|
||||
_bucket = self._bucket_name if 'bucket' not in args else args['bucket']
|
||||
_file = self._file_name if 'file' not in args else args['file']
|
||||
_data = self._client.get_object(Bucket=_bucket,Key=_file)
|
||||
return _data['ResponseMetadata']
|
||||
def close(self):
|
||||
self._client.close()
|
||||
|
||||
class Reader(s3) :
|
||||
"""
|
||||
Because s3 contains buckets and files, reading becomes a tricky proposition :
|
||||
- list files if file is None
|
||||
- stream content if file is Not None
|
||||
@TODO: support read from all buckets, think about it
|
||||
"""
|
||||
def __init__(self,**_args) :
|
||||
super().__init__(**_args)
|
||||
|
||||
def _stream(self,**_args):
|
||||
"""
|
||||
At this point we should stream a file from a given bucket
|
||||
"""
|
||||
_object = self._client.get_object(Bucket=_args['bucket'],Key=_args['file'])
|
||||
_stream = None
|
||||
try:
|
||||
_stream = _object['Body'].read()
|
||||
except Exception as e:
|
||||
pass
|
||||
if not _stream :
|
||||
return None
|
||||
if _object['ContentType'] in ['text/csv'] :
|
||||
return pd.read_csv(StringIO(str(_stream).replace("\\n","\n").replace("\\r","").replace("\'","")))
|
||||
else:
|
||||
return _stream
|
||||
|
||||
def read(self,**args) :
|
||||
|
||||
_name = self._file_name if 'file' not in args else args['file']
|
||||
_bucket = args['bucket'] if 'bucket' in args else self._bucket_name
|
||||
return self._stream(bucket=_bucket,file=_name)
|
||||
|
||||
|
||||
class Writer(s3) :
|
||||
"""
|
||||
|
||||
"""
|
||||
def __init__(self,**_args) :
|
||||
super().__init__(**_args)
|
||||
#
|
||||
#
|
||||
if not self.has(bucket=self._bucket_name) :
|
||||
self.make_bucket(self._bucket_name)
|
||||
def make_bucket(self,bucket_name):
|
||||
"""
|
||||
This function will create a bucket; it is best that the bucket is organized as a namespace
|
||||
:name name of the bucket
|
||||
"""
|
||||
|
||||
self._client.create_bucket(Bucket=bucket_name,CreateBucketConfiguration={'LocationConstraint': self._region})
|
||||
def write(self,_data,**_args):
|
||||
"""
|
||||
This function will write the data to the s3 bucket, files can be either csv, or json formatted files
|
||||
"""
|
||||
content = 'text/plain'
|
||||
if type(_data) == pd.DataFrame :
|
||||
_stream = _data.to_csv(index=False)
|
||||
content = 'text/csv'
|
||||
elif type(_data) == dict :
|
||||
_stream = json.dumps(_data)
|
||||
content = 'application/json'
|
||||
else:
|
||||
_stream = _data
|
||||
file = StringIO(_stream)
|
||||
bucket = self._bucket_name if 'bucket' not in _args else _args['bucket']
|
||||
file_name = self._file_name if 'file' not in _args else _args['file']
|
||||
self._client.put_object(Bucket=bucket, Key = file_name, Body=_stream,ContentType=content)
|
||||
pass
|
||||
|
@ -0,0 +1,18 @@
|
||||
import json
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
class IEncoder (json.JSONEncoder):
|
||||
def default (self,object):
|
||||
if isinstance(object,np.integer) :
|
||||
return int(object)
|
||||
elif isinstance(object,np.floating):
|
||||
return float(object)
|
||||
elif type(object) == np.ndarray :
|
||||
return object.tolist()
|
||||
elif type(object) == datetime :
|
||||
return object.isoformat()
|
||||
else:
|
||||
return super(IEncoder,self).default(object)
|
||||
|
||||
|
@ -0,0 +1,19 @@
|
||||
"""
|
||||
This file is intended to handle the duckdb database
|
||||
"""
|
||||
|
||||
import duckdb
|
||||
from transport.common import Reader,Writer
|
||||
|
||||
class Duck(Reader):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
self._path = None if 'path' not in _args else _args['path']
|
||||
self._handler = duckdb.connect() if not self._path else duckdb.connect(self._path)
|
||||
|
||||
|
||||
class DuckReader(Duck) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
def read(self,**_args) :
|
||||
pass
|
@ -0,0 +1,354 @@
|
||||
#!/usr/bin/env python
|
||||
__doc__ = """
|
||||
(c) 2018 - 2021 data-transport
|
||||
steve@the-phi.com, The Phi Technology LLC
|
||||
https://dev.the-phi.com/git/steve/data-transport.git
|
||||
|
||||
This program performs ETL between 9 supported data sources : Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza,Redshift, Sqlite, File
|
||||
LICENSE (MIT)
|
||||
Copyright 2016-2020, The Phi Technology LLC
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
Usage :
|
||||
transport --config <path-to-file.json> --procs <number-procs>
|
||||
@TODO: Create tables if they don't exist for relational databases
|
||||
example of configuration :
|
||||
|
||||
1. Move data from a folder to a data-store
|
||||
transport [--folder <path> ] --config <config.json> #-- assuming the configuration doesn't have folder
|
||||
transport --folder <path> --provider <postgresql|mongo|sqlite> --<database|db> <name> --table|doc <document_name>
|
||||
In this case the configuration should look like :
|
||||
{folder:..., target:{}}
|
||||
2. Move data from one source to another
|
||||
transport --config <file.json>
|
||||
{source:{..},target:{..}} or [{source:{..},target:{..}},{source:{..},target:{..}}]
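
A concrete configuration sketch (the providers, paths and table names below are placeholders):
    [
     {"source":{"provider":"sqlite","database":"/home/steve/demo.db3","table":"friends"},
      "target":{"provider":"postgresql","database":"demo","table":"friends"}}
    ]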
|
||||
|
||||
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import json
|
||||
import sys
|
||||
import transport
|
||||
import time
|
||||
import os
|
||||
|
||||
|
||||
from multiprocessing import Process
|
||||
# SYS_ARGS = {}
|
||||
# if len(sys.argv) > 1:
|
||||
|
||||
# N = len(sys.argv)
|
||||
# for i in range(1,N):
|
||||
# value = None
|
||||
# if sys.argv[i].startswith('--'):
|
||||
# key = sys.argv[i][2:] #.replace('-','')
|
||||
# SYS_ARGS[key] = 1
|
||||
# if i + 1 < N:
|
||||
# value = sys.argv[i + 1] = sys.argv[i+1].strip()
|
||||
# if key and value and not value.startswith('--'):
|
||||
# SYS_ARGS[key] = value
|
||||
|
||||
|
||||
# i += 2
|
||||
class Transporter(Process):
|
||||
"""
|
||||
The transporter (Jason Statham) moves data from one persistent store to another
|
||||
- callback functions
|
||||
:onFinish callback function when finished
|
||||
:onError callback function when an error occurs
|
||||
:source source data specification
|
||||
:target destination(s) to move the data to
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
super().__init__()
|
||||
# self.onfinish = _args['onFinish']
|
||||
# self._onerror = _args['onError']
|
||||
self._source = _args['source']
|
||||
self._target = _args['target']
|
||||
|
||||
#
|
||||
# Let's ensure we can support multiple targets
|
||||
self._target = [self._target] if type(self._target) != list else self._target
|
||||
pass
|
||||
def run(self):
|
||||
|
||||
_reader = transport.get.etl(source=self._source,target=self._target)
|
||||
#
|
||||
if 'cmd' in self._source or 'query' in self._source :
|
||||
_query = self._source['cmd'] if 'cmd' in self._source else self._source['query']
|
||||
return _reader.read(**_query)
|
||||
else:
|
||||
return _reader.read()
|
||||
|
||||
# def _read(self,**_args):
|
||||
# """
|
||||
# This function
|
||||
# """
|
||||
# _reader = transport.factory.instance(**self._source)
|
||||
# #
|
||||
# # If arguments are provided then a query is to be executed (not just a table dump)
|
||||
# if 'cmd' in self._source or 'query' in self._source :
|
||||
# _query = self._source['cmd'] if 'cmd' in self._source else self._source['query']
|
||||
# return _reader.read(**_query)
|
||||
# else:
|
||||
# return _reader.read()
|
||||
# # return _reader.read() if 'query' not in self._source else _reader.read(**self._source['query'])
|
||||
|
||||
# def _delegate_write(self,_data,**_args):
|
||||
# """
|
||||
# This function will write a data-frame to a designated data-store, The function is built around a delegation design pattern
|
||||
# :data data-frame or object to be written
|
||||
# """
|
||||
# if _data.shape[0] > 0 :
|
||||
# for _target in self._target :
|
||||
# if 'write' not in _target :
|
||||
# _target['context'] = 'write'
|
||||
# # _target['lock'] = True
|
||||
# else:
|
||||
# # _target['write']['lock'] = True
|
||||
# pass
|
||||
# _writer = transport.factory.instance(**_target)
|
||||
# _writer.write(_data,**_args)
|
||||
# if hasattr(_writer,'close') :
|
||||
# _writer.close()
|
||||
|
||||
# def write(self,_df,**_args):
|
||||
# """
|
||||
# """
|
||||
# SEGMENT_COUNT = 6
|
||||
# MAX_ROWS = 1000000
|
||||
# # _df = self.read()
|
||||
# _segments = np.array_split(np.arange(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])])
|
||||
# # _index = 0
|
||||
|
||||
|
||||
# for _indexes in _segments :
|
||||
# _fwd_args = {} if not _args else _args
|
||||
|
||||
# self._delegate_write(_df.iloc[_indexes],**_fwd_args)
|
||||
# time.sleep(1)
|
||||
# #
|
||||
# # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?)
|
||||
# pass
|
||||
|
||||
def instance(**_args):
|
||||
pthread = Transporter (**_args)
|
||||
pthread.start()
|
||||
return pthread
|
||||
pass
|
||||
# class Post(Process):
|
||||
# def __init__(self,**args):
|
||||
# super().__init__()
|
||||
# self.store = args['target']
|
||||
# if 'provider' not in args['target'] :
|
||||
# pass
|
||||
# self.PROVIDER = args['target']['type']
|
||||
# # self.writer = transport.factory.instance(**args['target'])
|
||||
# else:
|
||||
# self.PROVIDER = args['target']['provider']
|
||||
# self.store['context'] = 'write'
|
||||
# # self.store = args['target']
|
||||
# self.store['lock'] = True
|
||||
# # self.writer = transport.instance(**args['target'])
|
||||
# #
|
||||
# # If the table doesn't exists maybe create it ?
|
||||
# #
|
||||
# self.rows = args['rows']
|
||||
# # self.rows = args['rows'].fillna('')
|
||||
|
||||
# def log(self,**_args) :
|
||||
# if ETL.logger :
|
||||
# ETL.logger.info(**_args)
|
||||
|
||||
# def run(self):
|
||||
# _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows
|
||||
|
||||
# writer = transport.factory.instance(**self.store)
|
||||
# writer.write(_info)
|
||||
# writer.close()
|
||||
|
||||
|
||||
# class ETL (Process):
|
||||
# logger = None
|
||||
# def __init__(self,**_args):
|
||||
# super().__init__()
|
||||
|
||||
# self.name = _args['id'] if 'id' in _args else 'UNREGISTERED'
|
||||
# # if 'provider' not in _args['source'] :
|
||||
# # #@deprecate
|
||||
# # self.reader = transport.factory.instance(**_args['source'])
|
||||
# # else:
|
||||
# # #
|
||||
# # # This is the new interface
|
||||
# # _args['source']['context'] = 'read'
|
||||
|
||||
# # self.reader = transport.instance(**_args['source'])
|
||||
|
||||
# #
|
||||
# # do we have an sql query provided or not ....
|
||||
# # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None
|
||||
# # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None
|
||||
# # self._oargs = _args['target'] #transport.factory.instance(**_args['target'])
|
||||
# self._source = _args ['source']
|
||||
# self._target = _args['target']
|
||||
# self._source['context'] = 'read'
|
||||
# self._target['context'] = 'write'
|
||||
|
||||
# self.JOB_COUNT = _args['jobs']
|
||||
# self.jobs = []
|
||||
# # self.logger = transport.factory.instance(**_args['logger'])
|
||||
# def log(self,**_args) :
|
||||
# if ETL.logger :
|
||||
# ETL.logger.info(**_args)
|
||||
|
||||
# def run(self):
|
||||
# # if self.cmd :
|
||||
# # idf = self.reader.read(**self.cmd)
|
||||
# # else:
|
||||
# # idf = self.reader.read()
|
||||
# # idf = pd.DataFrame(idf)
|
||||
# # # idf = idf.replace({np.nan: None}, inplace = True)
|
||||
|
||||
# # idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
|
||||
# # self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
|
||||
|
||||
# #
|
||||
# # writing the data to a designated data source
|
||||
# #
|
||||
# try:
|
||||
|
||||
|
||||
# _log = {"name":self.name,"rows":{"input":0,"output":0}}
|
||||
# _reader = transport.factory.instance(**self._source)
|
||||
# if 'table' in self._source :
|
||||
# _df = _reader.read()
|
||||
# else:
|
||||
# _df = _reader.read(**self._source['cmd'])
|
||||
# _log['rows']['input'] = _df.shape[0]
|
||||
# #
|
||||
# # Let's write the input data-frame to the target ...
|
||||
# _writer = transport.factory.instance(**self._target)
|
||||
# _writer.write(_df)
|
||||
# _log['rows']['output'] = _df.shape[0]
|
||||
|
||||
# # self.log(module='write',action='partitioning',jobs=self.JOB_COUNT)
|
||||
# # rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT)
|
||||
|
||||
# # #
|
||||
# # # @TODO: locks
|
||||
# # for i in np.arange(self.JOB_COUNT) :
|
||||
# # # _id = ' '.join([str(i),' table ',self.name])
|
||||
# # indexes = rows[i]
|
||||
# # segment = idf.loc[indexes,:].copy() #.to_dict(orient='records')
|
||||
# # _name = "partition-"+str(i)
|
||||
# # if segment.shape[0] == 0 :
|
||||
# # continue
|
||||
|
||||
# # proc = Post(target = self._oargs,rows = segment,name=_name)
|
||||
# # self.jobs.append(proc)
|
||||
# # proc.start()
|
||||
|
||||
# # self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0])
|
||||
# # while self.jobs :
|
||||
# # jobs = [job for job in proc if job.is_alive()]
|
||||
# # time.sleep(1)
|
||||
# except Exception as e:
|
||||
# print (e)
|
||||
# self.log(**_log)
|
||||
# def is_done(self):
|
||||
# self.jobs = [proc for proc in self.jobs if proc.is_alive()]
|
||||
# return len(self.jobs) == 0
|
||||
|
||||
|
||||
# def instance (**_args):
|
||||
# """
|
||||
# path to configuration file
|
||||
# """
|
||||
# _path = _args['path']
|
||||
# _config = {}
|
||||
# jobs = []
|
||||
# if os.path.exists(_path) :
|
||||
# file = open(_path)
|
||||
# _config = json.loads(file.read())
|
||||
# file.close()
|
||||
# if _config and type
|
||||
|
||||
|
||||
# def _instance(**_args):
|
||||
# """
|
||||
# :path ,index, id
|
||||
# :param _info list of objects with {source,target}`
|
||||
# :param logger
|
||||
# """
|
||||
# logger = _args['logger'] if 'logger' in _args else None
|
||||
# if 'path' in _args :
|
||||
# _info = json.loads((open(_args['path'])).read())
|
||||
|
||||
|
||||
# if 'index' in _args :
|
||||
# _index = int(_args['index'])
|
||||
# _info = _info[_index]
|
||||
|
||||
# elif 'id' in _args :
|
||||
# _info = [_item for _item in _info if '_id' in _item and _item['id'] == _args['id']]
|
||||
# _info = _info[0] if _info else _info
|
||||
# else:
|
||||
# _info = _args['info']
|
||||
|
||||
# if logger and type(logger) != str:
|
||||
# ETL.logger = logger
|
||||
# elif logger == 'console':
|
||||
# ETL.logger = transport.factory.instance(provider='console',context='write',lock=True)
|
||||
# if type(_info) in [list,dict] :
|
||||
# _info = _info if type(_info) != dict else [_info]
|
||||
# #
|
||||
# # The assumption here is that the objects within the list are {source,target}
|
||||
# jobs = []
|
||||
# for _item in _info :
|
||||
|
||||
# _item['jobs'] = 5 if 'procs' not in _args else int(_args['procs'])
|
||||
# _job = ETL(**_item)
|
||||
|
||||
# _job.start()
|
||||
# jobs.append(_job)
|
||||
# return jobs
|
||||
|
||||
# else:
|
||||
# return None
|
||||
|
||||
# if __name__ == '__main__' :
|
||||
# _info = json.loads(open (SYS_ARGS['config']).read())
|
||||
# index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None
|
||||
# procs = []
|
||||
# for _config in _info :
|
||||
# if 'source' in SYS_ARGS :
|
||||
# _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}}
|
||||
|
||||
# _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs'])
|
||||
# etl = ETL (**_config)
|
||||
# if index is None:
|
||||
|
||||
# etl.start()
|
||||
# procs.append(etl)
|
||||
|
||||
# elif _info.index(_config) == index :
|
||||
|
||||
# # print (_config)
|
||||
# procs = [etl]
|
||||
# etl.start()
|
||||
# break
|
||||
# #
|
||||
# #
|
||||
# N = len(procs)
|
||||
# while procs :
|
||||
# procs = [thread for thread in procs if not thread.is_done()]
|
||||
# if len(procs) < N :
|
||||
# print (["Finished ",(N-len(procs)), " remaining ", len(procs)])
|
||||
# N = len(procs)
|
||||
# time.sleep(1)
|
||||
# # print ("We're done !!")
|
@ -0,0 +1,131 @@
|
||||
"""
|
||||
This class is a wrapper around read/write classes of cloud,sql,nosql,other packages
|
||||
The wrapper allows for application of plugins as pre-post conditions.
|
||||
NOTE: Plugins are converted to a pipeline, so we apply a pipeline when reading or writing:
|
||||
- upon initialization we will load plugins
|
||||
- on read/write we apply a pipeline (if passed as an argument)
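
A minimal sketch of the plugin hook (the plugin reference format is whatever plugin_ix expects; the value below is a placeholder):
    _reader = transport.get.reader(label='aws', plugins=['<plugin-reference>'])
    _df = _reader.read()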
|
||||
"""
|
||||
from transport.plugins import Plugin, PluginLoader
|
||||
import transport
|
||||
from transport import providers
|
||||
from multiprocessing import Process
|
||||
import time
|
||||
|
||||
import plugin_ix
|
||||
|
||||
class IO:
|
||||
"""
|
||||
Base wrapper class for read/write and support for logs
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
_agent = _args['agent']
|
||||
plugins = _args['plugins'] if 'plugins' in _args else None
|
||||
|
||||
self._agent = _agent
|
||||
# self._ixloader = plugin_ix.Loader () #-- must indicate where the plugin registry file is
|
||||
self._ixloader = plugin_ix.Loader (registry=plugin_ix.Registry(folder=transport.registry.REGISTRY_PATH))
|
||||
if plugins :
|
||||
self.init_plugins(plugins)
|
||||
|
||||
def meta (self,**_args):
|
||||
if hasattr(self._agent,'meta') :
|
||||
return self._agent.meta(**_args)
|
||||
return []
|
||||
|
||||
def close(self):
|
||||
if hasattr(self._agent,'close') :
|
||||
self._agent.close()
|
||||
# def apply(self):
|
||||
# """
|
||||
# applying pre/post conditions given a pipeline expression
|
||||
# """
|
||||
# for _pointer in self._plugins :
|
||||
# _data = _pointer(_data)
|
||||
def apply(self,_query):
|
||||
if hasattr(self._agent,'apply') :
|
||||
return self._agent.apply(_query)
|
||||
return None
|
||||
def submit(self,_query):
|
||||
return self.delegate('submit',_query)
|
||||
def delegate(self,_name,_query):
|
||||
if hasattr(self._agent,_name) :
|
||||
pointer = getattr(self._agent,_name)
|
||||
return pointer(_query)
|
||||
return None
|
||||
def init_plugins(self,plugins):
|
||||
for _ref in plugins :
|
||||
self._ixloader.set(_ref)
|
||||
|
||||
class IReader(IO):
|
||||
"""
|
||||
This is a wrapper for read functionalities
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
def read(self,**_args):
|
||||
if 'plugins' in _args :
|
||||
self.init_plugins(_args['plugins'])
|
||||
|
||||
_data = self._agent.read(**_args)
|
||||
# if self._plugins and self._plugins.ratio() > 0 :
|
||||
# _data = self._plugins.apply(_data)
|
||||
#
|
||||
# output data
|
||||
|
||||
#
|
||||
# applying the visitor design pattern
|
||||
_data = self._ixloader.visitor(_data)
|
||||
return _data
|
||||
class IWriter(IO):
|
||||
def __init__(self,**_args): #_agent,pipeline=None):
|
||||
super().__init__(**_args) #_agent,pipeline)
|
||||
def write(self,_data,**_args):
|
||||
# if 'plugins' in _args :
|
||||
# self._init_plugins(_args['plugins'])
|
||||
if 'plugins' in _args :
|
||||
self.init_plugins(_args['plugins'])
|
||||
|
||||
self._ixloader.visitor(_data)
|
||||
self._agent.write(_data,**_args)
|
||||
|
||||
#
|
||||
# The ETL object in its simplest form is an aggregation of read/write objects
|
||||
# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process
|
||||
|
||||
class IETL(IReader) :
|
||||
"""
|
||||
This class performs an ETL operation by inheriting a read and adding writes as pipeline functions
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
super().__init__(agent=transport.get.reader(**_args['source']),plugins=None)
|
||||
if 'target' in _args:
|
||||
self._targets = _args['target'] if type(_args['target']) == list else [_args['target']]
|
||||
else:
|
||||
self._targets = []
|
||||
self.jobs = []
|
||||
#
|
||||
# If the parent is already multiprocessing
|
||||
self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess']
|
||||
def read(self,**_args):
|
||||
_data = super().read(**_args)
|
||||
_schema = super().meta()
|
||||
for _kwargs in self._targets :
|
||||
if _schema :
|
||||
_kwargs['schema'] = _schema
|
||||
self.post(_data,**_kwargs)
|
||||
|
||||
return _data
|
||||
def run(self) :
|
||||
return self.read()
|
||||
def post (self,_data,**_args) :
|
||||
"""
|
||||
This function performs the write operation against a target data store
|
||||
:_args parameters associated with writer object
|
||||
"""
|
||||
writer = transport.get.writer(**_args)
|
||||
if 'schema' in _args :
|
||||
writer.write(_data,schema=_args['schema'])
|
||||
else:
|
||||
writer.write(_data)
|
||||
writer.close()
|
@ -0,0 +1,12 @@
|
||||
"""
|
||||
Steve L. Nyemba, nyemba@gmail.com
|
||||
This namespace implements support for NoSQL databases couchdb, mongodb, cloudant ...
|
||||
"""
|
||||
# from transport.nosql import couchdb
|
||||
# from transport.nosql import mongodb
|
||||
from . import mongodb
|
||||
from . import couchdb
|
||||
# import mongodb
|
||||
# import couchdb
|
||||
|
||||
cloudant = couchdb
|
@ -0,0 +1,213 @@
|
||||
"""
|
||||
Data-Transport
|
||||
Steve L. Nyemba, The Phi Technology
|
||||
|
||||
This file is a wrapper around couchdb using the IBM Cloudant SDK, which provides an interface to couchdb
|
||||
|
||||
"""
|
||||
import cloudant
|
||||
import json
|
||||
import sys
|
||||
# from transport.common import Reader, Writer
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class Couch:
|
||||
"""
|
||||
This class is a wrapper for read/write against couchdb. The class captures common operations for read/write.
|
||||
@param url host & port reference default http://localhost:5984
|
||||
@param doc document id (target document)
|
||||
@param dbname database name (target)
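
e.g. a minimal sketch (the url, credentials and names below are placeholders):
    _couchr = Reader(url='http://localhost:5984', username='admin', password='***', dbname='demo', doc='friends')
    _doc = _couchr.read()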
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
url = args['url'] if 'url' in args else 'http://localhost:5984'
|
||||
self._id = args['doc']
|
||||
dbname = args['dbname']
|
||||
if 'username' not in args and 'password' not in args :
|
||||
self.server = cloudant.CouchDB(None,None,url=url)
|
||||
else:
|
||||
self.server = cloudant.CouchDB(args['username'],args['password'],url=url)
|
||||
self.server.connect()
|
||||
|
||||
if dbname in self.server.all_dbs() :
|
||||
self.dbase = self.server.get(dbname,dbname,True)
|
||||
#
|
||||
# @TODO Check if the database exists ...
|
||||
#
|
||||
doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id)
|
||||
if not doc.exists():
|
||||
doc = self.dbase.create_document({"_id":self._id})
|
||||
doc.save()
|
||||
else:
|
||||
self.dbase = None
|
||||
"""
|
||||
Ensuring the preconditions are met for processing
|
||||
"""
|
||||
def isready(self):
|
||||
p = self.server.metadata() != {}
|
||||
if p == False or not self.dbase:
|
||||
return False
|
||||
#
|
||||
# At this point we are sure that the server is connected
|
||||
# We are also sure that the database actually exists
|
||||
#
|
||||
doc = cloudant.document.Document(self.dbase,self._id)
|
||||
# q = self.dbase.all_docs(key=self._id)['rows']
|
||||
# if not q :
|
||||
if not doc.exists():
|
||||
return False
|
||||
return True
|
||||
|
||||
def view(self,**args):
|
||||
"""
|
||||
The function will execute a view (provided a user is authenticated)
|
||||
:id design document _design/xxxx (provide full name with _design prefix)
|
||||
:view_name name of the view to be executed
|
||||
:key(s) key(s) to be used to filter the content
|
||||
"""
|
||||
document = cloudant.design_document.DesignDocument(self.dbase,args['id'])
|
||||
document.fetch()
|
||||
params = {'group_level':1,'group':True}
|
||||
if 'key' in args :
|
||||
params ['key'] = args['key']
|
||||
elif 'keys' in args :
|
||||
params['keys'] = args['keys']
|
||||
return document.get_view(args['view_name'])(**params)['rows']
|
||||
|
||||
|
||||
|
||||
|
||||
class Reader(Couch):
|
||||
"""
|
||||
This class will read an attachment from couchdb and return it to the calling code. The attachment must have been placed beforehand.
|
||||
@TODO: Account for security & access control
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
"""
|
||||
@param filename filename (attachment)
|
||||
"""
|
||||
#
|
||||
# setting the basic parameters for the underlying couchdb connection
|
||||
Couch.__init__(self,**args)
|
||||
if 'filename' in args :
|
||||
self.filename = args['filename']
|
||||
else:
|
||||
self.filename = None
|
||||
|
||||
|
||||
def stream(self):
|
||||
#
|
||||
# @TODO Need to get this working ...
|
||||
#
|
||||
document = cloudant.document.Document(self.dbase,self._id)
|
||||
# content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ;
|
||||
content = document.get_attachment(self.filename)
|
||||
for row in content:
|
||||
yield row
|
||||
|
||||
def read(self,**args):
|
||||
if self.filename is not None:
|
||||
return self.stream()
|
||||
else:
|
||||
return self.basic_read()
|
||||
def basic_read(self):
|
||||
document = cloudant.document.Document(self.dbase,self._id)
|
||||
|
||||
# document = self.dbase.get(self._id)
|
||||
if document.exists() :
|
||||
document.fetch()
|
||||
document = dict(document)
|
||||
del document['_rev']
|
||||
else:
|
||||
document = {}
|
||||
return document
|
||||
|
||||
class Writer(Couch):
|
||||
"""
|
||||
This class will write on a couchdb document provided a scope
|
||||
The scope is the attribute that will be on the couchdb document
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
"""
|
||||
@param uri host & port reference
|
||||
@param uid user id involved
|
||||
@param filename filename (attachment)
|
||||
@param dbname database name (target)
|
||||
"""
|
||||
|
||||
super().__init__(**args)
|
||||
def set (self,info):
|
||||
document = cloudant.document.Document(self.dbase,self._id)
|
||||
if document.exists() :
|
||||
keys = list(set(document.keys()) - set(['_id','_rev','_attachments']))
|
||||
for id in keys :
|
||||
document.field_set(document,id,None)
|
||||
for id in info :
|
||||
value = info[id]
|
||||
document.field_set(document,id,value)
|
||||
|
||||
document.save()
|
||||
pass
|
||||
else:
|
||||
_document = dict({"_id":self._id},**info)
|
||||
self.dbase.create_document(_document)
|
||||
def write(self,info):
|
||||
"""
|
||||
write a given attribute to a document database
|
||||
@param info dictionary of attributes to be written to the document
|
||||
"""
|
||||
|
||||
# document = self.dbase.get(self._id)
|
||||
document = cloudant.document.Document(self.dbase,self._id) #.get(self._id)
|
||||
if document.exists() is False :
|
||||
document = self.dbase.create_document({"_id":self._id})
|
||||
# label = params['label']
|
||||
# row = params['row']
|
||||
# if label not in document :
|
||||
# document[label] = []
|
||||
# document[label].append(row)
|
||||
for key in info :
|
||||
if key in document and type(document[key]) == list :
|
||||
document[key] += info[key]
|
||||
else:
|
||||
document[key] = info[key]
|
||||
|
||||
document.save()
|
||||
# self.dbase.bulk_docs([document])
|
||||
# self.dbase.save_doc(document)
|
||||
|
||||
def upload(self,**args):
|
||||
"""
|
||||
:param filename name of the attachment to be uploaded
|
||||
:param content content of the file (binary or text)
|
||||
:param content_type mime type of the attachment (default text/plain)
|
||||
"""
|
||||
mimetype = args['content_type'] if 'content_type' in args else 'text/plain'
|
||||
document = cloudant.document.Document(self.dbase,self._id)
|
||||
document.put_attachment(self.dbase,args['filename'],mimetype,args['content'])
|
||||
document.save()
|
||||
|
||||
def archive(self,params=None):
|
||||
"""
|
||||
This function will archive the document onto itself.
|
||||
"""
|
||||
# document = self.dbase.all_docs(self._id,include_docs=True)
|
||||
document = cloudant.document.Document(self.dbase,self._id)
|
||||
document.fetch()
|
||||
content = {}
|
||||
# _doc = {}
|
||||
for id in list(document.keys()):
|
||||
if id not in ['_id','_rev','_attachments'] :
|
||||
content[id] = document[id]
|
||||
del document[id]
|
||||
|
||||
content = json.dumps(content)
|
||||
# document= _doc
|
||||
now = str(datetime.today())
|
||||
|
||||
name = '-'.join([document['_id'] , now,'.json'])
|
||||
self.upload(filename=name,content=content,content_type='application/json')
|
||||
# self.dbase.bulk_docs([document])
|
||||
# self.dbase.put_attachment(document,content,name,'application/json')
|
||||
# document.put_attachment(self.dbase,name,'application/json',content)
|
||||
# document.save()
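# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): writing a few attributes to a
# couchdb document with the Writer above, then reading them back. The url,
# dbname and doc values are placeholders; a couchdb server must be reachable
# and the 'demo' database must already exist for this to run.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _writer = Writer(url='http://localhost:5984', dbname='demo', doc='sample-doc')
    _writer.write({'logs': [{'date': str(datetime.today()), 'status': 'ok'}]})
    print(Reader(url='http://localhost:5984', dbname='demo', doc='sample-doc').read())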
|
@ -0,0 +1,263 @@
|
||||
"""
|
||||
Data Transport - 1.0
|
||||
Steve L. Nyemba, The Phi Technology LLC
|
||||
|
||||
This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce)
|
||||
"""
|
||||
from pymongo import MongoClient
|
||||
import bson
|
||||
from bson.objectid import ObjectId
|
||||
from bson.binary import Binary
|
||||
# import nujson as json
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# import gridfs
|
||||
from gridfs import GridFS
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
from multiprocessing import Lock, RLock
|
||||
from transport.common import IEncoder
|
||||
|
||||
class Mongo :
|
||||
lock = RLock()
|
||||
"""
|
||||
Basic mongodb functions are captured here
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
"""
|
||||
:dbname database name/identifier
|
||||
:host host and port of the database by default localhost:27017
|
||||
:username username for authentication
|
||||
:password password for current user
|
||||
"""
|
||||
self.host = 'localhost' if 'host' not in args else args['host']
|
||||
if ':' not in self.host and 'port' in args :
|
||||
self.host = ':'.join([self.host,str(args['port'])])
|
||||
self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism']
|
||||
# authSource=(args['authSource'] if 'authSource' in args else self.dbname)
|
||||
self._lock = False if 'lock' not in args else args['lock']
|
||||
self.dbname = None
|
||||
username = password = None
|
||||
if 'auth_file' in args :
|
||||
_info = json.loads((open(args['auth_file'])).read())
|
||||
|
||||
|
||||
else:
|
||||
_info = {}
|
||||
_args = dict(args,**_info)
|
||||
_map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'}
|
||||
for key in _args :
|
||||
if key in ['username','password'] :
|
||||
username = _args['username'] if key=='username' else username
|
||||
password = _args['password'] if key == 'password' else password
|
||||
continue
|
||||
value = _args[key]
|
||||
if key in _map :
|
||||
key = _map[key]
|
||||
|
||||
self.setattr(key,value)
|
||||
#
|
||||
# Let us perform aliasing in order to remain backwards compatible
|
||||
|
||||
self.dbname = self.db if hasattr(self,'db') else self.dbname
|
||||
self.collection = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None))
|
||||
if username and password :
|
||||
self.client = MongoClient(self.host,
|
||||
username=username,
|
||||
password=password ,
|
||||
authSource=self.authSource,
|
||||
authMechanism=self.mechanism)
|
||||
|
||||
else:
|
||||
self.client = MongoClient(self.host,maxPoolSize=10000)
|
||||
|
||||
self.db = self.client[self.dbname]
|
||||
|
||||
def isready(self):
|
||||
p = self.dbname in self.client.list_database_names()
|
||||
q = self.collection in self.client[self.dbname].list_collection_names()
|
||||
return p and q
|
||||
def setattr(self,key,value):
|
||||
_allowed = ['host','port','db','doc','collection','authSource','mechanism']
|
||||
if key in _allowed :
|
||||
setattr(self,key,value)
|
||||
pass
|
||||
def close(self):
|
||||
self.client.close()
|
||||
def meta(self,**_args):
|
||||
return []
|
||||
class Reader(Mongo):
|
||||
"""
|
||||
This class will read from a mongodb data store and return the content of a collection as a data frame
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
Mongo.__init__(self,**args)
|
||||
def read(self,**args):
|
||||
|
||||
if 'mongo' in args or 'cmd' in args or 'pipeline' in args:
|
||||
#
|
||||
# @TODO:
|
||||
cmd = {}
|
||||
if 'aggregate' not in cmd and 'aggregate' not in args:
|
||||
cmd['aggregate'] = self.collection
|
||||
elif 'aggregate' in args :
|
||||
cmd['aggregate'] = args['aggregate']
|
||||
if 'pipeline' in args :
|
||||
cmd['pipeline']= args['pipeline']
|
||||
|
||||
if 'pipeline' not in args or 'aggregate' not in cmd :
|
||||
cmd = args['mongo'] if 'mongo' in args else args['cmd']
|
||||
if "aggregate" in cmd :
|
||||
if "allowDiskUse" not in cmd :
|
||||
cmd["allowDiskUse"] = True
|
||||
if "cursor" not in cmd :
|
||||
cmd["cursor"] = {}
|
||||
r = []
|
||||
out = self.db.command(cmd)
|
||||
#@TODO: consider using a yield (generator) works wonders
|
||||
while True :
|
||||
if 'values' in out :
|
||||
r += out['values']
|
||||
if 'cursor' in out :
|
||||
key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch'
|
||||
else:
|
||||
key = 'n'
|
||||
if 'cursor' in out and out['cursor'][key] :
|
||||
r += list(out['cursor'][key])
|
||||
elif key in out and out[key]:
|
||||
r.append (out[key])
|
||||
# yield out['cursor'][key]
|
||||
if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) :
|
||||
break
|
||||
else:
|
||||
out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]})
|
||||
|
||||
|
||||
return pd.DataFrame(r)
|
||||
else:
|
||||
|
||||
|
||||
if 'table' in args or 'collection' in args :
|
||||
if 'table' in args:
|
||||
_uid = args['table']
|
||||
elif 'collection' in args :
|
||||
_uid = args['collection']
|
||||
else:
|
||||
_uid = self.collection
|
||||
else:
|
||||
_uid = self.collection
|
||||
collection = self.db[_uid]
|
||||
_filter = args['filter'] if 'filter' in args else {}
|
||||
_df = pd.DataFrame(collection.find(_filter))
|
||||
columns = _df.columns.tolist()[1:]
|
||||
return _df[columns]
|
||||
def view(self,**args):
|
||||
"""
|
||||
This function is designed to execute a view (map/reduce) operation
|
||||
"""
|
||||
pass
|
||||
class Writer(Mongo):
|
||||
"""
|
||||
This class is designed to write to a mongodb collection within a database
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
Mongo.__init__(self,**args)
|
||||
def upload(self,**args) :
|
||||
"""
|
||||
This function will upload a file to the current database (using GridFS)
|
||||
:param data binary stream/text to be stored
|
||||
:param filename filename to be used
|
||||
:param encoding content_encoding (default utf-8)
|
||||
|
||||
"""
|
||||
if 'encoding' not in args :
|
||||
args['encoding'] = 'utf-8'
|
||||
gfs = GridFS(self.db)
|
||||
gfs.put(**args)
|
||||
|
||||
def archive(self):
|
||||
"""
|
||||
This function will archive the documents of the current collection into GridFS (as a JSON attachment) and then empty the collection
|
||||
"""
|
||||
collection = self.db[self.collection]
|
||||
rows = list(collection.find())
|
||||
for row in rows :
|
||||
if type(row['_id']) == ObjectId :
|
||||
row['_id'] = str(row['_id'])
|
||||
stream = Binary(json.dumps(rows,cls=IEncoder).encode())
|
||||
collection.delete_many({})
|
||||
now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)])
|
||||
name = ".".join([self.collection,'archive',now])+".json"
|
||||
description = " ".join([self.collection,'archive',str(len(rows))])
|
||||
self.upload(filename=name,data=stream,description=description,content_type='application/json')
|
||||
# gfs = GridFS(self.db)
|
||||
# gfs.put(filename=name,description=description,data=stream,encoding='utf-8')
|
||||
# self.write({{"filename":name,"file":stream,"description":descriptions}})
|
||||
|
||||
|
||||
pass
|
||||
|
||||
def write(self,info,**_args):
|
||||
"""
|
||||
This function will write to a given collection i.e add a record to a collection (no updates)
|
||||
@param info new record in the collection to be added
|
||||
"""
|
||||
# document = self.db[self.collection].find()
|
||||
#collection = self.db[self.collection]
|
||||
# if type(info) == list :
|
||||
# self.db[self.collection].insert_many(info)
|
||||
# else:
|
||||
try:
|
||||
if 'table' in _args or 'collection' in _args :
|
||||
_uid = _args['table'] if 'table' in _args else _args['collection']
|
||||
else:
|
||||
_uid = self.collection if 'doc' not in _args else _args['doc']
|
||||
if self._lock :
|
||||
Mongo.lock.acquire()
|
||||
|
||||
if type(info) == list or type(info) == pd.DataFrame :
|
||||
if type(info) == pd.DataFrame :
|
||||
info = info.to_dict(orient='records')
|
||||
# info if type(info) == list else info.to_dict(orient='records')
|
||||
info = json.loads(json.dumps(info,cls=IEncoder))
|
||||
self.db[_uid].insert_many(info)
|
||||
else:
|
||||
#
|
||||
# sometimes a dictionary can have keys with arrays (odd shaped)
|
||||
#
|
||||
_keycount = len(info.keys())
|
||||
_arraycount = [len(info[key]) for key in info if type(info[key]) in (list,np.ndarray)]
|
||||
if _arraycount and len(_arraycount) == _keycount and np.max(_arraycount) == np.min(_arraycount) :
|
||||
#
|
||||
# In case an object with consistent structure is passed, we store it accordingly
|
||||
#
|
||||
self.write(pd.DataFrame(info),**_args)
|
||||
else:
|
||||
self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder)))
|
||||
finally:
|
||||
if self._lock :
|
||||
Mongo.lock.release()
|
||||
def set(self,document):
|
||||
"""
|
||||
if an '_id' is provided and the collection is not empty, the matching document is replaced; otherwise nothing is done.
|
||||
Please use this function with great care (archive the content first before using it... for safety)
|
||||
"""
|
||||
|
||||
collection = self.db[self.collection]
|
||||
if collection.count_documents({}) > 0 and '_id' in document:
|
||||
id = document['_id']
|
||||
del document['_id']
|
||||
collection.find_one_and_replace({'_id':id},document)
|
||||
else:
|
||||
#
|
||||
# Nothing to be done if we did not find anything
|
||||
#
|
||||
pass
|
||||
# collection.delete_many({})
|
||||
# self.write(info)
|
||||
def close(self):
|
||||
Mongo.close(self)
|
||||
# collecton.update_one({"_id":self.collection},document,True)
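# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): a round-trip against a local
# mongodb instance. The host, dbname and collection values are placeholders
# and assume an unauthenticated server on the default port.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _args = {'host': 'localhost:27017', 'dbname': 'demo', 'collection': 'people'}
    _writer = Writer(**_args)
    _writer.write([{'name': 'jane', 'age': 30}, {'name': 'john', 'age': 40}])
    _reader = Reader(**_args)
    print(_reader.read(filter={'age': {'$gt': 35}}))
    _writer.close()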
|
||||
|
@ -0,0 +1 @@
|
||||
from . import files, http, rabbitmq, callback, console
|
@ -0,0 +1,49 @@
|
||||
"""
|
||||
This module uses callback architectural style as a writer to enable user-defined code to handle the output of a reader
|
||||
The intent is to allow users to have control over the output of data to handle things like logging, encryption/decryption and other custom post-processing
|
||||
"""
|
||||
import queue
|
||||
from threading import Thread, Lock
|
||||
# from transport.common import Reader,Writer
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
class Writer :
|
||||
lock = Lock()
|
||||
_queue = {'default':queue.Queue()}
|
||||
def __init__(self,**_args):
|
||||
self._cache = {}
|
||||
self._callback = _args['callback'] if 'callback' in _args else None
|
||||
self._id = _args['id'] if 'id' in _args else 'default'
|
||||
if self._id not in Writer._queue :
|
||||
Writer._queue[self._id] = queue.Queue()
|
||||
thread = Thread(target=self._forward)
|
||||
thread.start()
|
||||
def _forward(self):
|
||||
_q = Writer._queue[self._id]
|
||||
_data = _q.get()
|
||||
_q.task_done()
|
||||
self._callback(_data)
|
||||
|
||||
def has(self,**_args) :
|
||||
return self._callback is not None
|
||||
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
This will empty the queue and have it ready for another operation
|
||||
"""
|
||||
_q = Writer._queue[self._id]
|
||||
with _q.mutex:
|
||||
_q.queue.clear()
|
||||
_q.all_tasks_done.notify_all()
|
||||
|
||||
def write(self,_data,**_args):
|
||||
_id = _args['id'] if 'id' in _args else self._id
|
||||
|
||||
_q = Writer._queue[_id]
|
||||
_q.put(_data)
|
||||
_q.join()
|
||||
|
||||
|
||||
# self.callback = print
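# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): using the callback Writer to
# hand a written frame to user-defined code. The handler below is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    def _log(_df):
        # user-defined handler invoked for the frame written to the queue
        print(_df.shape)
    _writer = Writer(callback=_log, id='demo')
    _writer.write(pd.DataFrame({'x': [1, 2, 3]}))
    _writer.close()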
|
@ -0,0 +1,10 @@
|
||||
"""
|
||||
This class uses the callback pattern to allow output to be printed to the console (debugging)
|
||||
"""
|
||||
from . import callback
|
||||
|
||||
|
||||
class Writer (callback.Writer):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(callback=print)
|
||||
|
@ -0,0 +1,69 @@
|
||||
"""
|
||||
This file is a wrapper around pandas built-in functionalities to handle character delimited files
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
class File :
|
||||
def __init__(self,**params):
|
||||
"""
|
||||
|
||||
@param path absolute path of the file to be read
|
||||
"""
|
||||
self.path = params['path'] if 'path' in params else None
|
||||
self.delimiter = params['delimiter'] if 'delimiter' in params else ','
|
||||
|
||||
def isready(self):
|
||||
return os.path.exists(self.path)
|
||||
def meta(self,**_args):
|
||||
return []
|
||||
|
||||
class Reader (File):
|
||||
"""
|
||||
This class is designed to read data from disk (location on hard drive)
|
||||
@pre : isready() == True
|
||||
"""
|
||||
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
def read(self,**args):
|
||||
_path = self.path if 'path' not in args else args['path']
|
||||
_delimiter = self.delimiter if 'delimiter' not in args else args['delimiter']
|
||||
return pd.read_csv(_path,delimiter=_delimiter)
|
||||
def stream(self,**args):
|
||||
raise Exception ("streaming needs to be implemented")
|
||||
class Writer (File):
|
||||
|
||||
"""
|
||||
This class writes output to disk in a designated location. It will write text to a text file
|
||||
- If a delimiter is provided it will use that to generate a xchar-delimited file
|
||||
- If not then the object will be dumped as is
|
||||
"""
|
||||
# THREAD_LOCK = RLock()
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
self._mode = 'w' if 'mode' not in _args else _args['mode']
|
||||
|
||||
def write(self,info,**_args):
|
||||
"""
|
||||
This function writes a record to a designated file
|
||||
@param info data frame to be written
|
||||
@param path optional path overriding the default location
|
||||
"""
|
||||
try:
|
||||
|
||||
_delim = self.delimiter if 'delimiter' not in _args else _args['delimiter']
|
||||
_path = self.path if 'path' not in _args else _args['path']
|
||||
_mode = self._mode if 'mode' not in _args else _args['mode']
|
||||
info.to_csv(_path,index=False,sep=_delim,mode=_mode)
|
||||
|
||||
pass
|
||||
except Exception as e:
|
||||
#
|
||||
# Not sure what should be done here ...
|
||||
print (e)
|
||||
pass
|
||||
finally:
|
||||
# DiskWriter.THREAD_LOCK.release()
|
||||
pass
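# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): writing a data frame to a
# delimited file and reading it back. The /tmp path is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _path = '/tmp/sample.csv'
    Writer(path=_path, delimiter=',').write(pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']}))
    print(Reader(path=_path).read())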
|
@ -0,0 +1,88 @@
|
||||
from flask import request, session
|
||||
from datetime import datetime
|
||||
import re
|
||||
# from transport.common import Reader, Writer
|
||||
import json
|
||||
import requests
|
||||
from io import StringIO
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class Reader:
|
||||
"""
|
||||
This class is designed to read data from an HTTP endpoint (URL)
|
||||
The response will be held in memory and processed accordingly
|
||||
NOTE: This is inefficient for large payloads and can crash a micro-instance (be careful)
|
||||
"""
|
||||
|
||||
def __init__(self,**_args):
|
||||
self._url = _args['url']
|
||||
self._headers = None if 'headers' not in _args else _args['headers']
|
||||
|
||||
# def isready(self):
|
||||
# return self.file_length > 0
|
||||
def format(self,_response):
|
||||
_mimetype= _response.headers['Content-Type']
|
||||
if _mimetype.startswith('text/csv'):
|
||||
_content = _response.text
|
||||
return pd.read_csv(StringIO(_content))
|
||||
#
|
||||
# @TODO: Add support for excel, JSON and other file formats that fit into a data-frame
|
||||
#
|
||||
|
||||
return _response.text
|
||||
def read(self,**_args):
|
||||
if self._headers :
|
||||
r = requests.get(self._url,headers = self._headers)
|
||||
else:
|
||||
r = requests.get(self._url)
|
||||
return self.format(r)
|
||||
|
||||
class Writer:
|
||||
"""
|
||||
This class is designed to submit data to an endpoint (url)
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
"""
|
||||
@param key required session key
|
||||
"""
|
||||
self._url = _args['url']
|
||||
self._name = _args['name']
|
||||
self._method = 'post' if 'method' not in _args else _args['method']
|
||||
|
||||
# self.session = params['queue']
|
||||
# self.session['sql'] = []
|
||||
# self.session['csv'] = []
|
||||
# self.tablename = re.sub('..+$','',params['filename'])
|
||||
# self.session['uid'] = params['uid']
|
||||
#self.xchar = params['xchar']
|
||||
|
||||
|
||||
def format_sql(self,row):
|
||||
values = "','".join([col.replace('"','').replace("'",'') for col in row])
|
||||
return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename)
|
||||
def isready(self):
|
||||
return True
|
||||
def write(self,_data,**_args):
|
||||
#
|
||||
#
|
||||
_method = self._method if 'method' not in _args else _args['method']
|
||||
_method = _method.lower()
|
||||
_mimetype = 'text/csv'
|
||||
if type(_data) == dict :
|
||||
_mimetype = 'application/json'
|
||||
_content = _data
|
||||
else:
|
||||
_content = _data.to_dict(orient='records')
|
||||
_headers = {'Content-Type':_mimetype}
|
||||
_pointer = getattr(requests,_method)
|
||||
|
||||
_pointer (self._url,data=json.dumps({self._name:_content}),headers=_headers)
|
||||
|
||||
|
||||
# label = params['label']
|
||||
# row = params ['row']
|
||||
|
||||
# if label == 'usable':
|
||||
# self.session['csv'].append(self.format(row,','))
|
||||
# self.session['sql'].append(self.format_sql(row))
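# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): reading a CSV document from a
# public endpoint. The URL below is a placeholder and must point to a real
# 'text/csv' resource for the call to succeed.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _reader = Reader(url='https://example.com/sample.csv')
    _df = _reader.read()
    print(_df.head() if isinstance(_df, pd.DataFrame) else _df[:200])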
|
@ -0,0 +1,272 @@
|
||||
"""
|
||||
Data Transport - 1.0
|
||||
Steve L. Nyemba, The Phi Technology LLC
|
||||
|
||||
This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange)
|
||||
|
||||
"""
|
||||
import pika
|
||||
from datetime import datetime
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
# if sys.version_info[0] > 2 :
|
||||
# from transport.common import Reader, Writer
|
||||
# else:
|
||||
# from common import Reader, Writer
|
||||
import json
|
||||
from multiprocessing import RLock
|
||||
class MessageQueue:
|
||||
"""
|
||||
This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq)
|
||||
:host
|
||||
:xid identifier of the exchange
|
||||
:qid identifier of the queue
|
||||
"""
|
||||
def __init__(self,**params):
|
||||
self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server
|
||||
self.port= 5672 if 'port' not in params else params['port']
|
||||
self.virtual_host = '/' if 'vhost' not in params else params['vhost']
|
||||
self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange
|
||||
self.queue = params['queue'] if 'queue' in params else 'demo'
|
||||
self.connection = None
|
||||
self.channel = None
|
||||
|
||||
self.name = self.__class__.__name__.lower() if 'name' not in params else params['name']
|
||||
|
||||
username = password = None
|
||||
if 'username' in params :
|
||||
username = params['username']
|
||||
password = params['password']
|
||||
if 'auth_file' in params :
|
||||
_info = json.loads((open(params['auth_file'])).read())
|
||||
username=_info['username']
|
||||
password=_info['password']
|
||||
self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host
|
||||
self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange
|
||||
self.queue = _info['queue'] if 'queue' in _info else self.queue
|
||||
|
||||
self.credentials= pika.PlainCredentials('guest','guest')
|
||||
if 'username' in params :
|
||||
self.credentials = pika.PlainCredentials(
|
||||
params['username'],
|
||||
('' if 'password' not in params else params['password'])
|
||||
)
|
||||
|
||||
def init(self,label=None):
|
||||
properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host,
|
||||
client_properties={'connection_name':self.name},
|
||||
credentials=self.credentials)
|
||||
self.connection = pika.BlockingConnection(properties)
|
||||
self.channel = self.connection.channel()
|
||||
self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True)
|
||||
if label is None:
|
||||
self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True)
|
||||
else:
|
||||
self.qhandler = self.channel.queue_declare(queue=label,durable=True)
|
||||
|
||||
self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue)
|
||||
|
||||
def isready(self):
|
||||
#self.init()
|
||||
resp = self.connection is not None and self.connection.is_open
|
||||
# self.close()
|
||||
return resp
|
||||
def finalize(self):
|
||||
pass
|
||||
def close(self):
|
||||
if self.connection.is_closed == False :
|
||||
self.channel.close()
|
||||
self.connection.close()
|
||||
|
||||
class Writer(MessageQueue):
|
||||
"""
|
||||
This class is designed to publish content to an AMQP (Rabbitmq)
|
||||
The class will rely on pika to implement this functionality
|
||||
|
||||
We will publish information to a given queue for a given exchange
|
||||
"""
|
||||
def __init__(self,**params):
|
||||
#self.host= params['host']
|
||||
#self.exchange = params['uid']
|
||||
#self.queue = params['queue']
|
||||
MessageQueue.__init__(self,**params);
|
||||
self.init()
|
||||
def write(self,data,_type='text/plain'):
|
||||
"""
|
||||
This function writes a stream of data to the a given queue
|
||||
@param object object to be written (will be converted to JSON)
|
||||
@TODO: make this less chatty
|
||||
"""
|
||||
|
||||
stream = json.dumps(data) if isinstance(data,dict) else data
|
||||
self.channel.basic_publish(
|
||||
exchange=self.exchange,
|
||||
routing_key=self.queue,
|
||||
body=stream,
|
||||
properties=pika.BasicProperties(content_type=_type,delivery_mode=2)
|
||||
);
|
||||
# self.close()
|
||||
|
||||
def flush(self):
|
||||
self.init()
|
||||
_mode = 1 #-- Non persistent
|
||||
self.channel.queue_delete( queue=self.queue);
|
||||
self.close()
|
||||
|
||||
class Reader(MessageQueue):
|
||||
"""
|
||||
This class will read from a queue provided an exchange, queue and host
|
||||
@TODO: Account for security and virtualhosts
|
||||
"""
|
||||
|
||||
def __init__(self,**params):
|
||||
"""
|
||||
@param host host
|
||||
@param uid exchange identifier
|
||||
@param qid queue identifier
|
||||
"""
|
||||
|
||||
#self.host= params['host']
|
||||
#self.exchange = params['uid']
|
||||
#self.queue = params['qid']
|
||||
MessageQueue.__init__(self,**params);
|
||||
# self.init()
|
||||
self.durable = False if 'durable' not in params else params['durable']
|
||||
# if 'durable' in params :
|
||||
# self.durable = True
|
||||
# else:
|
||||
# self.durable = False
|
||||
self.size = -1
|
||||
self.data = {}
|
||||
# def init(self,qid):
|
||||
|
||||
# properties = pika.ConnectionParameters(host=self.host)
|
||||
# self.connection = pika.BlockingConnection(properties)
|
||||
# self.channel = self.connection.channel()
|
||||
# self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True)
|
||||
|
||||
# self.info = self.channel.queue_declare(queue=qid,durable=True)
|
||||
|
||||
|
||||
def callback(self,channel,method,header,stream):
|
||||
"""
|
||||
This is the callback function designed to process the data stream from the queue
|
||||
|
||||
"""
|
||||
|
||||
r = []
|
||||
# if re.match("^\{|\[",stream) is not None:
|
||||
if stream.startswith(b'{') or stream.startswith(b'['):
|
||||
r = json.loads(stream)
|
||||
else:
|
||||
|
||||
r = stream
|
||||
|
||||
qid = self.qhandler.method.queue
|
||||
if qid not in self.data :
|
||||
self.data[qid] = []
|
||||
|
||||
self.data[qid].append(r)
|
||||
#
|
||||
# We stop reading when all the messages in the queue have been collected
|
||||
#
|
||||
if self.size == len(self.data[qid]) or len(self.data[qid]) == self.qhandler.method.message_count:
|
||||
self.close()
|
||||
|
||||
def read(self,**args):
|
||||
"""
|
||||
This function will read the messages from a queue (or a list of queues)
|
||||
@TODO:
|
||||
Implement channel.basic_get in order to retrieve a single message at a time
|
||||
Have the number of messages retrieved be specified by size (parameter)
|
||||
"""
|
||||
r = {}
|
||||
self.size = -1 if 'size' not in args else int(args['size'])
|
||||
#
|
||||
# We enabled the reader to be able to read from several queues (sequentially for now)
|
||||
# The qid parameter will be an array of queues the reader will be reading from
|
||||
#
|
||||
if isinstance(self.queue,str) :
|
||||
self.queue = [self.queue]
|
||||
|
||||
for qid in self.queue:
|
||||
self.init(qid)
|
||||
# r[qid] = []
|
||||
|
||||
if self.qhandler.method.message_count > 0:
|
||||
|
||||
self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False);
|
||||
self.channel.start_consuming()
|
||||
else:
|
||||
|
||||
pass
|
||||
#self.close()
|
||||
# r[qid].append( self.data)
|
||||
|
||||
return self.data
|
||||
class QueueListener(MessageQueue):
|
||||
lock = RLock()
|
||||
"""
|
||||
This class is designed to have an active listener (worker) against a specified Exchange/Queue
|
||||
It is initialized as would any other object and will require a callback function to address the objects returned.
|
||||
"""
|
||||
def __init__(self,**args):
|
||||
MessageQueue.__init__(self,**args)
|
||||
self.listen = self.read
|
||||
self.apply = args['apply'] if 'apply' in args else print
|
||||
self.lock = False if 'lock' not in args else args['lock']
|
||||
|
||||
def finalize(self,channel,ExceptionReason):
|
||||
pass
|
||||
|
||||
def callback(self,channel,method,header,stream) :
|
||||
_info= {}
|
||||
# if re.match("^\{|\[",stream) is not None:
|
||||
|
||||
|
||||
if stream.startswith(b"[") or stream.startswith(b"{"):
|
||||
_info = json.loads(stream)
|
||||
else:
|
||||
|
||||
_info = stream
|
||||
#
|
||||
# At this point we should invoke the apply function with a lock if need be
|
||||
# @TODO: Establish a vocabulary
|
||||
|
||||
if stream == b'QUIT' :
|
||||
# channel.exit()
|
||||
self.close()
|
||||
if self.lock == True :
|
||||
QueueListener.lock.acquire()
|
||||
try:
|
||||
#
|
||||
# In case the user has not specified a function to apply the data against, it will simply be printed
|
||||
#
|
||||
self.apply(_info)
|
||||
except Exception as e:
|
||||
pass
|
||||
if self.lock == True :
|
||||
QueueListener.lock.release()
|
||||
def read(self):
|
||||
|
||||
self.init(self.queue)
|
||||
|
||||
self.channel.basic_consume(self.queue,self.callback,auto_ack=True);
|
||||
self.channel.start_consuming()
|
||||
|
||||
|
||||
|
||||
class Factory :
|
||||
@staticmethod
|
||||
def instance(**_args):
|
||||
"""
|
||||
:param count number of workers
|
||||
:param apply function workers
|
||||
"""
|
||||
_apply = _args['apply']
|
||||
_count = _args['count']
|
||||
for i in range(int(_count)) :
|
||||
_name = _args['name'] if 'name' in _args else 'worker_'+str(i)
|
||||
transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file'])
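# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): publishing a message and then
# draining the queue with the Reader. The host/exchange/queue values are
# placeholders and assume a rabbitmq broker with default guest credentials.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    _args = {'host': 'localhost', 'exchange': 'amq.direct', 'queue': 'demo'}
    _writer = Writer(**_args)
    _writer.write({'status': 'ok'})
    _writer.close()
    print(Reader(**_args).read())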
|
@ -0,0 +1,161 @@
|
||||
"""
|
||||
The functions within are designed to load external files and apply functions against the data
|
||||
The plugins are applied as
|
||||
- post-processing if we are reading data
|
||||
- and pre-processing if we are writing data
|
||||
|
||||
The plugin will use a decorator to identify meaningful functions
|
||||
@TODO: This should work in tandem with logging (otherwise we don't have visibility into what is going on)
|
||||
"""
|
||||
import importlib as IL
|
||||
import importlib.util
|
||||
import sys
|
||||
import os
|
||||
import pandas as pd
|
||||
import time
|
||||
|
||||
class Plugin :
|
||||
"""
|
||||
Implementing a function decorator for data-transport plugins (pre/post)-processing
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
"""
|
||||
:name name of the plugin
|
||||
:mode restrict to reader/writer
|
||||
:about tell what the function is about
|
||||
"""
|
||||
self._name = _args['name'] if 'name' in _args else None
|
||||
self._version = _args['version'] if 'version' in _args else '0.1'
|
||||
self._doc = _args['doc'] if 'doc' in _args else "N/A"
|
||||
self._mode = _args['mode'] if 'mode' in _args else 'rw'
|
||||
def __call__(self,pointer,**kwargs):
|
||||
def wrapper(_args,**kwargs):
|
||||
return pointer(_args,**kwargs)
|
||||
#
|
||||
# @TODO:
|
||||
# add attributes to the wrapper object
|
||||
#
|
||||
self._name = pointer.__name__ if not self._name else self._name
|
||||
setattr(wrapper,'transport',True)
|
||||
setattr(wrapper,'name',self._name)
|
||||
setattr(wrapper,'version',self._version)
|
||||
setattr(wrapper,'doc',self._doc)
|
||||
return wrapper
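#
# Illustrative sketch (not part of this module): a user-defined plugin file
# would decorate a frame-in/frame-out function with the Plugin decorator, e.g.
#
#   @Plugin(name='drop_nulls', mode='r', doc='remove rows with null values')
#   def drop_nulls(_df, **_kwargs):
#       return _df.dropna()
#
# The decorator tags the wrapped function (transport/name/version/doc) so that
# PluginLoader.isplugin below can recognize it once the file is loaded.
#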
|
||||
|
||||
class PluginLoader :
|
||||
"""
|
||||
This class is intended to load a plugin, make it available, and assess the quality of the developed plugin
|
||||
"""
|
||||
|
||||
def __init__(self,**_args):
|
||||
"""
|
||||
"""
|
||||
# _names = _args['names'] if 'names' in _args else None
|
||||
# path = _args['path'] if 'path' in _args else None
|
||||
# self._names = _names if type(_names) == list else [_names]
|
||||
self._modules = {}
|
||||
self._names = []
|
||||
self._registry = _args['registry']
|
||||
|
||||
pass
|
||||
def load (self,**_args):
|
||||
"""
|
||||
This function loads a plugin
|
||||
"""
|
||||
self._modules = {}
|
||||
self._names = []
|
||||
path = _args ['path']
|
||||
if os.path.exists(path) :
|
||||
_alias = path.split(os.sep)[-1]
|
||||
spec = importlib.util.spec_from_file_location(_alias, path)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module) #--loads it into sys.modules
|
||||
for _name in dir(module) :
|
||||
if self.isplugin(module,_name) :
|
||||
self._modules[_name] = getattr(module,_name)
|
||||
# self._names [_name]
|
||||
def format (self,**_args):
|
||||
uri = _args['alias'],_args['name']
|
||||
# def set(self,_pointer) :
|
||||
def set(self,_key) :
|
||||
"""
|
||||
This function will set a pointer to the list of modules to be called
|
||||
This should be used within the context of using the framework as a library
|
||||
"""
|
||||
if type(_key).__name__ == 'function':
|
||||
#
|
||||
# The pointer is in the code provided by the user and loaded in memory
|
||||
#
|
||||
_pointer = _key
|
||||
_key = 'inline@'+_key.__name__
|
||||
# self._names.append(_key.__name__)
|
||||
else:
|
||||
_pointer = self._registry.get(key=_key)
|
||||
|
||||
if _pointer :
|
||||
self._modules[_key] = _pointer
|
||||
self._names.append(_key)
|
||||
|
||||
def isplugin(self,module,name):
|
||||
"""
|
||||
This function determines if a module is a recognized plugin
|
||||
:module module object loaded from importlib
|
||||
:name name of the function of interest
|
||||
"""
|
||||
|
||||
p = type(getattr(module,name)).__name__ =='function'
|
||||
q = hasattr(getattr(module,name),'transport')
|
||||
#
|
||||
# @TODO: add a generated key, and more in-depth validation
|
||||
return p and q
|
||||
def has(self,_name):
|
||||
"""
|
||||
This will determine if the module name is loaded or not
|
||||
"""
|
||||
return _name in self._modules
|
||||
def ratio (self):
|
||||
"""
|
||||
This function determines the ratio of loaded modules to the requested names
|
||||
"""
|
||||
|
||||
_n = len(self._names)
|
||||
return len(set(self._modules.keys()) & set (self._names)) / _n if _n else 0
|
||||
def apply(self,_data,_logger=[]):
|
||||
_input= {}
|
||||
|
||||
for _name in self._modules :
|
||||
try:
|
||||
_input = {'action':'plugin','object':_name,'input':{'status':'PASS'}}
|
||||
_pointer = self._modules[_name]
|
||||
if type(_data) == list :
|
||||
_data = pd.DataFrame(_data)
|
||||
_brow,_bcol = list(_data.shape)
|
||||
|
||||
#
|
||||
# @TODO: add exception handling
|
||||
_data = _pointer(_data)
|
||||
|
||||
_input['input']['shape'] = {'rows-dropped':_brow - _data.shape[0]}
|
||||
except Exception as e:
|
||||
_input['input']['status'] = 'FAILED'
|
||||
print (e)
|
||||
time.sleep(1)
|
||||
if _logger:
|
||||
try:
|
||||
_logger(**_input)
|
||||
except Exception as e:
|
||||
pass
|
||||
return _data
|
||||
# def apply(self,_data,_name):
|
||||
# """
|
||||
# This function applies an external module function against the data.
|
||||
# The responsibility is on the plugin to properly return data, thus responsibility is offloaded
|
||||
# """
|
||||
# try:
|
||||
|
||||
# _pointer = self._modules[_name]
|
||||
# _data = _pointer(_data)
|
||||
|
||||
# except Exception as e:
|
||||
# pass
|
||||
# return _data
|
@ -0,0 +1,52 @@
|
||||
"""
|
||||
This file aggregates the identifiers of all data-store providers supported by the framework
|
||||
"""
|
||||
|
||||
BIGQUERY='bigquery'
|
||||
|
||||
POSTGRESQL = 'postgresql'
|
||||
MONGODB = 'mongodb'
|
||||
HTTP='http'
|
||||
|
||||
FILE = 'file'
|
||||
ETL = 'etl'
|
||||
|
||||
SQLITE = 'sqlite3'
|
||||
SQLITE3= 'sqlite3'
|
||||
DUCKDB = 'duckdb'
|
||||
|
||||
REDSHIFT = 'redshift'
|
||||
NETEZZA = 'netezza'
|
||||
MYSQL = 'mysql'
|
||||
MARIADB= MYSQL
|
||||
|
||||
COUCHDB = 'couchdb'
|
||||
CONSOLE = 'console'
|
||||
|
||||
TRANSPORT = ETL
|
||||
NEXTCLOUD = 'nextcloud'
|
||||
S3 = 's3'
|
||||
CALLBACK = 'callback'
|
||||
|
||||
RABBITMQ = 'rabbitmq'
|
||||
DATABRICKS = 'databricks'
|
||||
MSSQL ='sqlserver'
|
||||
SQLSERVER ='sqlserver'
|
||||
|
||||
#
|
||||
# synonyms of the above
|
||||
BQ = BIGQUERY
|
||||
MONGO = MONGODB
|
||||
FERRETDB= MONGODB
|
||||
PG = POSTGRESQL
|
||||
PSQL = POSTGRESQL
|
||||
PGSQL = POSTGRESQL
|
||||
|
||||
AWS_S3 = 's3'
|
||||
RABBIT = RABBITMQ
|
||||
ICEBERG='iceberg'
|
||||
APACHE_ICEBERG = 'iceberg'
|
||||
DRILL = 'drill'
|
||||
APACHE_DRILL = 'drill'
|
||||
# QLISTENER = 'qlistener'
|
||||
|
@ -0,0 +1,115 @@
|
||||
import os
|
||||
import json
|
||||
from info import __version__
|
||||
import copy
|
||||
import transport
|
||||
import importlib
|
||||
import importlib.util
|
||||
import shutil
|
||||
from io import StringIO
|
||||
|
||||
"""
|
||||
This module manages data from the registry and allows read-only access to it
|
||||
@TODO: add property to the DATA attribute
|
||||
"""
|
||||
if 'HOME' in os.environ :
|
||||
REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport'])
|
||||
else:
|
||||
REGISTRY_PATH=os.sep.join([os.environ['USERPROFILE'],'.data-transport'])
|
||||
|
||||
#
|
||||
# This path can be overriden by an environment variable ...
|
||||
#
|
||||
if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ :
|
||||
REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH']
|
||||
REGISTRY_FILE= 'transport-registry.json'
|
||||
DATA = {}
|
||||
|
||||
|
||||
def isloaded ():
|
||||
return DATA not in [{},None]
|
||||
def exists (path=REGISTRY_PATH,_file=REGISTRY_FILE) :
|
||||
"""
|
||||
This function determines if there is a registry at all
|
||||
"""
|
||||
p = os.path.exists(path)
|
||||
q = os.path.exists( os.sep.join([path,_file]))
|
||||
|
||||
return p and q
|
||||
def load (_path=REGISTRY_PATH,_file=REGISTRY_FILE):
|
||||
global DATA
|
||||
|
||||
if exists(_path) :
|
||||
path = os.sep.join([_path,_file])
|
||||
f = open(path)
|
||||
DATA = json.loads(f.read())
|
||||
f.close()
|
||||
def init (email,path=REGISTRY_PATH,override=False,_file=REGISTRY_FILE):
|
||||
"""
|
||||
Initializes the registry and will raise an exception in the event of an issue
|
||||
"""
|
||||
p = '@' in email
|
||||
#q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai','org']
|
||||
q = len(email.split('.')[-1]) in [2,3]
|
||||
if p and q :
|
||||
_config = {"email":email,'version':__version__}
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
filename = os.sep.join([path,_file])
|
||||
if not os.path.exists(filename) or override == True :
|
||||
|
||||
f = open(filename,'w')
|
||||
f.write( json.dumps(_config))
|
||||
f.close()
|
||||
# _msg = f"""{CHECK_MARK} Successfully wrote configuration to {path} from {email}"""
|
||||
|
||||
else:
|
||||
raise Exception (f"""Unable to write configuration, Please check parameters (or help) and try again""")
|
||||
else:
|
||||
raise Exception (f"""Invalid Input, {email} is not well formatted, provide an email with adequate format""")
|
||||
def lookup (label):
|
||||
global DATA
|
||||
return label in DATA
|
||||
has = lookup
|
||||
|
||||
def get (label='default') :
|
||||
global DATA
|
||||
return copy.copy(DATA[label]) if label in DATA else {}
|
||||
|
||||
def set (label, auth_file, default=False,path=REGISTRY_PATH) :
|
||||
"""
|
||||
This function will add a label (auth-file data) into the registry and can set it as the default
|
||||
"""
|
||||
if label == 'default' :
|
||||
raise Exception ("""Invalid label name provided, please change the label name and use the switch""")
|
||||
reg_file = os.sep.join([path,REGISTRY_FILE])
|
||||
if os.path.exists(path) and os.path.exists(reg_file):
|
||||
if type(auth_file) == str and os.path.exists (auth_file) :
|
||||
f = open(auth_file)
|
||||
elif type(auth_file) == StringIO:
|
||||
f = auth_file
|
||||
_info = json.loads(f.read())
|
||||
f.close()
|
||||
f = open(reg_file)
|
||||
_config = json.loads(f.read())
|
||||
f.close()
|
||||
|
||||
#
|
||||
# set the proposed label
|
||||
_object = transport.factory.instance(**_info)
|
||||
if _object :
|
||||
_config[label] = _info
|
||||
if default :
|
||||
_config['default'] = _info
|
||||
#
|
||||
# now we need to write this to the location
|
||||
f = open(reg_file,'w')
|
||||
f.write(json.dumps(_config))
|
||||
f.close()
|
||||
else:
|
||||
raise Exception( f"""Unable to load file locate at {path},\nLearn how to generate auth-file with wizard found at https://healthcareio.the-phi.com/data-transport""")
|
||||
pass
|
||||
else:
|
||||
pass
|
||||
pass
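# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): initializing a registry and
# registering auth data under a label. The email and connection values below
# are placeholders.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    init('jane.doe@example.com')                       # creates ~/.data-transport if needed
    _auth = StringIO(json.dumps({'provider': 'sqlite3', 'database': '/tmp/sample.db3'}))
    set('demo-sqlite', _auth, default=True)            # registers the auth data under a label
    load()
    print(get('demo-sqlite'))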
|
||||
|
@ -0,0 +1,18 @@
|
||||
"""
|
||||
This namespace/package wraps the sql functionalities for certain data-stores
|
||||
- netezza, postgresql, mysql and sqlite
|
||||
- mariadb, redshift (also included)
|
||||
"""
|
||||
from . import postgresql, mysql, netezza, sqlite, sqlserver, duckdb
|
||||
|
||||
|
||||
#
|
||||
# Creating aliases for support of additional data-store providers
|
||||
#
|
||||
mariadb = mysql
|
||||
redshift = postgresql
|
||||
sqlite3 = sqlite
|
||||
|
||||
|
||||
# from transport import sql
|
||||
|
@ -0,0 +1,169 @@
|
||||
"""
|
||||
This file encapsulates common operations associated with SQL databases via SQLAlchemy
|
||||
|
||||
"""
|
||||
import sqlalchemy as sqa
|
||||
from sqlalchemy import text , MetaData, inspect
|
||||
|
||||
import pandas as pd
|
||||
|
||||
class Base:
|
||||
def __init__(self,**_args):
|
||||
self._host = _args['host'] if 'host' in _args else 'localhost'
|
||||
self._port = None
|
||||
self._database = _args['database']
|
||||
self._table = _args['table'] if 'table' in _args else None
|
||||
_uri = self._get_uri(**_args)
|
||||
if type(_uri) == str :
|
||||
self._engine= sqa.create_engine(_uri,future=True)
|
||||
else:
|
||||
|
||||
_uri,_kwargs = _uri
|
||||
self._engine= sqa.create_engine(_uri,**_kwargs,future=True)
|
||||
def _set_uri(self,**_args) :
|
||||
"""
|
||||
:provider provider
|
||||
:host host and port
|
||||
:account account user/pwd
|
||||
"""
|
||||
_account = _args['account'] if 'account' in _args else None
|
||||
_host = _args['host']
|
||||
_provider = _args['provider'].replace(':','').replace('/','').strip()
|
||||
def _get_uri(self,**_args):
|
||||
"""
|
||||
This function will return the formatted uri for the sqlAlchemy engine
|
||||
"""
|
||||
raise Exception ("Function Needs to be implemented ")
|
||||
def meta (self,**_args):
|
||||
"""
|
||||
This function returns the schema (table definition) of a given table
|
||||
:table optional name of the table (can be fully qualified)
|
||||
"""
|
||||
_table = self._table if 'table' not in _args else _args['table']
|
||||
_map = {'TINYINT':'INTEGER','BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'}
|
||||
_schema = []
|
||||
# if _table :
|
||||
# if sqa.__version__.startswith('1.') :
|
||||
# _handler = sqa.MetaData(bind=self._engine)
|
||||
# _handler.reflect()
|
||||
# else:
|
||||
# #
|
||||
# # sqlalchemy's version 2.+
|
||||
# _handler = sqa.MetaData()
|
||||
# _handler.reflect(bind=self._engine)
|
||||
# #
|
||||
# # Let us extract the schema with the native types
|
||||
# _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'}
|
||||
# _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns]
|
||||
#
|
||||
try:
|
||||
if _table :
|
||||
_inspector = inspect(self._engine)
|
||||
_columns = _inspector.get_columns(_table)
|
||||
_schema = [{'name':column['name'],'type':_map.get(str(column['type']),str(column['type'])) } for column in _columns]
|
||||
return _schema
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
# else:
|
||||
return []
|
||||
def has(self,**_args):
|
||||
return self.meta(**_args)
|
||||
def apply(self,sql):
|
||||
"""
|
||||
Executing sql statement that returns query results (hence the restriction on sql and/or with)
|
||||
:sql SQL query to be executed
|
||||
|
||||
@TODO: Execution of stored procedures
|
||||
"""
|
||||
if sql.strip().lower().startswith('select') or sql.strip().lower().startswith('with') or sql.strip().lower().startswith('show'):
|
||||
|
||||
return pd.read_sql(sql,self._engine)
|
||||
else:
|
||||
_handler = self._engine.connect()
|
||||
_handler.execute(text(sql))
|
||||
_handler.commit ()
|
||||
_handler.close()
|
||||
return None
|
||||
|
||||
class SQLBase(Base):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
self._schema = _args.get('schema',None)
|
||||
def get_provider(self):
|
||||
raise Exception ("Provider Needs to be set ...")
|
||||
def get_default_port(self) :
|
||||
raise Exception ("default port needs to be set")
|
||||
|
||||
def _get_uri(self,**_args):
|
||||
_host = self._host
|
||||
_account = ''
|
||||
if self._port :
|
||||
_port = self._port
|
||||
else:
|
||||
_port = self.get_default_port()
|
||||
|
||||
_host = f'{_host}:{_port}'
|
||||
|
||||
if 'username' in _args :
|
||||
_account = ''.join([_args['username'],':',_args['password'],'@'])
|
||||
_database = self._database
|
||||
_provider = self.get_provider().replace(':','').replace('/','')
|
||||
# _uri = [f'{_provider}:/',_account,_host,_database]
|
||||
# _uri = [_item.strip() for _item in _uri if _item.strip()]
|
||||
# return '/'.join(_uri)
|
||||
return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}'
|
||||
def close(self,) :
|
||||
try:
|
||||
self._engine.dispose()
|
||||
except :
|
||||
pass
|
||||
class BaseReader(SQLBase):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
def read(self,**_args):
|
||||
"""
|
||||
This function will read a query or table from the specific database
|
||||
"""
|
||||
if 'sql' in _args :
|
||||
sql = _args['sql']
|
||||
else:
|
||||
_table = _args['table'] if 'table' in _args else self._table
|
||||
if self._schema and type(self._schema) == str :
|
||||
_table = f'{self._schema}.{_table}'
|
||||
sql = f'SELECT * FROM {_table}'
|
||||
return self.apply(sql)
|
||||
|
||||
|
||||
class BaseWriter (SQLBase):
|
||||
"""
|
||||
This class implements SQLAlchemy support for writing to a data-store (RDBMS)
|
||||
"""
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
def write(self,_data,**_args):
|
||||
|
||||
if type(_data) == dict :
|
||||
_df = pd.DataFrame([_data])
|
||||
elif type(_data) == list :
|
||||
_df = pd.DataFrame(_data)
|
||||
else:
|
||||
_df = _data.copy()
|
||||
#
|
||||
# We are assuming we have a data-frame at this point
|
||||
#
|
||||
_table = _args['table'] if 'table' in _args else self._table
|
||||
_mode = {'chunksize':2000000,'if_exists':'append','index':False}
|
||||
for key in ['if_exists','index','chunksize'] :
|
||||
if key in _args :
|
||||
_mode[key] = _args[key]
|
||||
# if 'schema' in _args :
|
||||
# _mode['schema'] = _args['schema']
|
||||
# if 'if_exists' in _args :
|
||||
# _mode['if_exists'] = _args['if_exists']
|
||||
if 'schema' in _args and type(_args['schema']) == str:
|
||||
self._schema = _args.get('schema',None)
|
||||
if self._schema :
|
||||
_mode['schema'] = self._schema
|
||||
_df.to_sql(_table,self._engine,**_mode)
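# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the module): a provider-specific class built
# on the base classes above, mirroring what transport.sql.postgresql and its
# siblings do. The database path and table name are placeholders.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class _DemoSQLite(SQLBase):
        def get_provider(self):
            return 'sqlite'
        def get_default_port(self):
            return ''
        def _get_uri(self, **_args):
            # sqlite does not need host/port, only a file path
            return f'sqlite:///{self._database}'
    _base = _DemoSQLite(database='/tmp/sample.db3')
    print(_base.meta(table='people'))      # [] if the table does not exist yet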
|
@ -0,0 +1,26 @@
|
||||
"""
|
||||
This module implements the handler for duckdb (in memory or not)
|
||||
"""
|
||||
from transport.sql.common import Base, BaseReader, BaseWriter
|
||||
|
||||
class Duck :
|
||||
def __init__(self,**_args):
|
||||
#
|
||||
# duckdb with none as database will operate as an in-memory database
|
||||
#
|
||||
self.database = _args['database'] if 'database' in _args else ''
|
||||
def get_provider(self):
|
||||
return "duckdb"
|
||||
|
||||
def _get_uri(self,**_args):
|
||||
return f"""duckdb:///{self.database}"""
|
||||
class Reader(Duck,BaseReader) :
|
||||
def __init__(self,**_args):
|
||||
Duck.__init__(self,**_args)
|
||||
BaseReader.__init__(self,**_args)
|
||||
def _get_uri(self,**_args):
|
||||
return super()._get_uri(**_args),{'connect_args':{'read_only':True}}
|
||||
class Writer(Duck,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
Duck.__init__(self,**_args)
|
||||
BaseWriter.__init__(self,**_args)
|
@ -0,0 +1,18 @@
|
||||
"""
|
||||
This file implements support for mysql and mariadb (via the mysql+mysqlconnector driver)
|
||||
"""
|
||||
from transport.sql.common import BaseReader, BaseWriter
|
||||
# import mysql.connector as my
|
||||
class MYSQL:
|
||||
|
||||
def get_provider(self):
|
||||
return "mysql+mysqlconnector"
|
||||
def get_default_port(self):
|
||||
return "3306"
|
||||
class Reader(MYSQL,BaseReader) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
class Writer(MYSQL,BaseWriter) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
@ -0,0 +1,15 @@
|
||||
import nzpy as nz
|
||||
from transport.sql.common import BaseReader, BaseWriter
|
||||
|
||||
class Netezza:
|
||||
def get_provider(self):
|
||||
return 'netezza+nzpy'
|
||||
def get_default_port(self):
|
||||
return '5480'
|
||||
|
||||
class Reader(Netezza,BaseReader) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
class Writer(Netezza,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
@ -0,0 +1,22 @@
|
||||
|
||||
from transport.sql.common import BaseReader , BaseWriter
|
||||
from psycopg2.extensions import register_adapter, AsIs
|
||||
import numpy as np
|
||||
|
||||
register_adapter(np.int64, AsIs)
|
||||
|
||||
class PG:
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
def get_provider(self):
|
||||
return "postgresql"
|
||||
|
||||
def get_default_port(self):
|
||||
return "5432"
|
||||
class Reader(PG,BaseReader) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
class Writer(PG,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
@ -0,0 +1,25 @@
|
||||
import sqlalchemy
|
||||
import pandas as pd
|
||||
from transport.sql.common import Base, BaseReader, BaseWriter
|
||||
class SQLite (BaseReader):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
if 'path' in _args :
|
||||
self._database = _args['path']
|
||||
if 'database' in _args :
|
||||
self._database = _args['database']
|
||||
def _get_uri(self,**_args):
|
||||
path = self._database
|
||||
return f'sqlite:///{path}' # ensure this is the correct path for the sqlite file.
|
||||
|
||||
class Reader(SQLite,BaseReader):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
# def read(self,**_args):
|
||||
# sql = _args['sql']
|
||||
# return pd.read_sql(sql,self._engine)
|
||||
|
||||
|
||||
class Writer (SQLite,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
@ -0,0 +1,24 @@
|
||||
"""
|
||||
Handling Microsoft SQL Server via pymssql driver/connector
|
||||
"""
|
||||
import sqlalchemy
|
||||
import pandas as pd
|
||||
from transport.sql.common import Base, BaseReader, BaseWriter
|
||||
|
||||
|
||||
class MsSQLServer:
|
||||
def __init__(self,**_args) :
|
||||
super().__init__(**_args)
|
||||
pass
|
||||
def get_provider(self):
|
||||
# mssql+pymssql://scott:tiger@hostname:port/dbname"
|
||||
return "mssql+pymssql"
|
||||
def get_default_port(self):
|
||||
return "1433"
|
||||
class Reader (MsSQLServer,BaseReader):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
|
||||
class Writer (MsSQLServer,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
@ -0,0 +1,7 @@
|
||||
"""
|
||||
This namespace/package is intended to handle read/writes against data warehouse solutions like :
|
||||
- apache iceberg
|
||||
- clickhouse (...)
|
||||
"""
|
||||
|
||||
from . import iceberg, drill
|
@ -0,0 +1,55 @@
|
||||
import sqlalchemy
|
||||
import pandas as pd
|
||||
from .. sql.common import BaseReader , BaseWriter
|
||||
import sqlalchemy as sqa
|
||||
|
||||
class Drill :
|
||||
__template = {'host':None,'port':None,'ssl':None,'table':None,'database':None}
|
||||
def __init__(self,**_args):
|
||||
|
||||
self._host = _args['host'] if 'host' in _args else 'localhost'
|
||||
self._port = _args['port'] if 'port' in _args else self.get_default_port()
|
||||
self._ssl = False if 'ssl' not in _args else _args['ssl']
|
||||
|
||||
self._table = _args['table'] if 'table' in _args else None
|
||||
if self._table and '.' in self._table :
|
||||
_seg = self._table.split('.')
|
||||
if len(_seg) > 2 :
|
||||
self._schema,self._database = _seg[:2]
|
||||
else:
|
||||
|
||||
self._database=_args['database']
|
||||
self._schema = self._database.split('.')[0]
|
||||
|
||||
def _get_uri(self,**_args):
|
||||
return f'drill+sadrill://{self._host}:{self._port}/{self._database}?use_ssl={self._ssl}'
|
||||
def get_provider(self):
|
||||
return "drill+sadrill"
|
||||
def get_default_port(self):
|
||||
return "8047"
|
||||
def meta(self,**_args):
|
||||
_table = _args['table'] if 'table' in _args else self._table
|
||||
if '.' in _table :
|
||||
_schema = _table.split('.')[:2]
|
||||
_schema = '.'.join(_schema)
|
||||
_table = _table.split('.')[-1]
|
||||
else:
|
||||
_schema = self._schema
|
||||
|
||||
# _sql = f"select COLUMN_NAME AS name, CASE WHEN DATA_TYPE ='CHARACTER VARYING' THEN 'CHAR ( 125 )' ELSE DATA_TYPE END AS type from INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='{_schema}' and TABLE_NAME='{_table}'"
|
||||
_sql = f"select COLUMN_NAME AS name, CASE WHEN DATA_TYPE ='CHARACTER VARYING' THEN 'CHAR ( '||COLUMN_SIZE||' )' ELSE DATA_TYPE END AS type from INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='{_schema}' and TABLE_NAME='{_table}'"
|
||||
try:
|
||||
_df = pd.read_sql(_sql,self._engine)
|
||||
return _df.to_dict(orient='records')
|
||||
except Exception as e:
|
||||
print (e)
|
||||
pass
|
||||
return []
|
||||
class Reader (Drill,BaseReader) :
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
||||
self._chunksize = 0 if 'chunksize' not in _args else _args['chunksize']
|
||||
self._engine= sqa.create_engine(self._get_uri(),future=True)
|
||||
class Writer(Drill,BaseWriter):
|
||||
def __init__(self,**_args):
|
||||
super().__init__(**_args)
|
@ -0,0 +1,151 @@
"""
dependency:
    - spark and SPARK_HOME environment variable must be set
NOTE:
    When using the streaming option, ensure that it is in line with the default (1000 rows) or increase it in spark-defaults.conf

"""
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_date, to_timestamp
import copy

class Iceberg :
    def __init__(self,**_args):
        """
        providing catalog meta information (you must get this from apache iceberg)
        """
        #
        # Turning off logging (it's annoying & unprofessional)
        #
        # _spconf = SparkContext()
        # _spconf.setLogLevel("ERROR")
        #
        # @TODO:
        # Make arrangements for additional configuration elements
        #
        self._session = SparkSession.builder.appName("data-transport").getOrCreate()
        self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
        # self._session.sparkContext.setLogLevel("ERROR")
        self._catalog = self._session.catalog
        self._table = _args['table'] if 'table' in _args else None

        if 'catalog' in _args :
            #
            # Let us set the default catalog
            self._catalog.setCurrentCatalog(_args['catalog'])

        else:
            # No current catalog has been set ...
            pass
        if 'database' in _args :
            self._database = _args['database']
            self._catalog.setCurrentDatabase(self._database)
        else:
            #
            # Should we set the default as the first one if available ?
            #
            pass
        self._catalogName = self._catalog.currentCatalog()
        self._databaseName = self._catalog.currentDatabase()
    def meta (self,**_args) :
        """
        This function should return the schema of a table (only)
        """
        _schema = []
        try:
            _table = _args['table'] if 'table' in _args else self._table
            _tableName = self._getPrefix(**_args) + f".{_table}"
            _tmp = self._session.table(_tableName).schema
            _schema = _tmp.jsonValue()['fields']
            for _item in _schema :
                del _item['nullable'],_item['metadata']
        except Exception as e:

            pass
        return _schema
    def _getPrefix (self,**_args):
        _catName = self._catalogName if 'catalog' not in _args else _args['catalog']
        _datName = self._databaseName if 'database' not in _args else _args['database']

        return '.'.join([_catName,_datName])
    def apply(self,_query):
        """
        sql query/command to run against apache iceberg
        """
        return self._session.sql(_query).toPandas()
    def has (self,**_args):
        try:
            _prefix = self._getPrefix(**_args)
            if _prefix.endswith('.') :
                return False
            return _args['table'] in [_item.name for _item in self._catalog.listTables(_prefix)]
        except Exception as e:
            print (e)
        return False

    def close(self):
        self._session.stop()
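To illustrate the catalog helpers above, a hedged sketch; the catalog, database and table names are made up, and it assumes a Spark session that can already reach an Iceberg catalog:

    _store = Iceberg(catalog='my_catalog', database='analytics')
    if _store.has(table='events'):
        print(_store.meta(table='events'))                        # e.g. [{'name': 'id', 'type': 'long'}, ...]
        _df = _store.apply('SELECT COUNT(*) AS n FROM events')    # pandas DataFrame
    _store.close()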
class Reader(Iceberg) :
    def __init__(self,**_args):
        super().__init__(**_args)
    def read(self,**_args):
        _table = self._table
        _prefix = self._getPrefix(**_args)
        if 'table' in _args or _table:
            _table = _args['table'] if 'table' in _args else _table
            _table = _prefix + f'.{_table}'
            return self._session.table(_table).toPandas()
        else:
            sql = _args['sql']
            return self._session.sql(sql).toPandas()
        pass
class Writer (Iceberg):
    """
    Writing data to an Apache Iceberg data warehouse (using pyspark)
    """
    def __init__(self,**_args):
        super().__init__(**_args)
        self._mode = 'append' if 'mode' not in _args else _args['mode']
        self._table = None if 'table' not in _args else _args['table']
    def format (self,_schema) :
        _iceSchema = StructType([])
        _map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(),
                'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()}
        for _item in _schema :
            _name = _item['name']
            _type = _item['type'].lower()
            if _type not in _map :
                _iceType = StringType()
            else:
                _iceType = _map[_type]

            _iceSchema.add (StructField(_name,_iceType,True))
        return _iceSchema if len(_iceSchema) else []
    def write(self,_data,**_args):
        _prefix = self._getPrefix(**_args)
        if 'table' not in _args and not self._table :
            raise Exception (f"Table Name should be specified for catalog/database {_prefix}")
        _schema = self.format(_args['schema']) if 'schema' in _args else []
        if not _schema :
            rdd = self._session.createDataFrame(_data,verifySchema=False)
        else :
            rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True)
        _mode = self._mode if 'mode' not in _args else _args['mode']
        _table = self._table if 'table' not in _args else _args['table']

        # print (_data.shape,_mode,_table)

        if not self._session.catalog.tableExists(_table):
            # # @TODO:
            # # add partitioning information here
            rdd.writeTo(_table).using('iceberg').create()

            # # _mode = 'overwrite'
            # # rdd.write.format('iceberg').mode(_mode).saveAsTable(_table)
        else:
            # rdd.writeTo(_table).append()
            # # _table = f'{_prefix}.{_table}'

            rdd.coalesce(10).write.format('iceberg').mode('append').save(_table)
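Putting the Reader and Writer together, a minimal sketch; the module path, catalog, database and table names are hypothetical, and it assumes SPARK_HOME and an Iceberg catalog are already configured for the session:

    from transport.warehouse import iceberg   # assumed module path

    _reader = iceberg.Reader(catalog='my_catalog', database='analytics', table='events')
    _df = _reader.read()                       # whole table as a pandas DataFrame

    _writer = iceberg.Writer(catalog='my_catalog', database='analytics',
                             table='events_copy', mode='append')
    _writer.write(_df)                         # optionally pass schema=[{'name':..., 'type':...}, ...]
    _writer.close()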