pull/1/head
Steve Nyemba 5 months ago
parent e0e48a3d02
commit d154ac3cd0

@@ -4,12 +4,11 @@ This project implements an abstraction of objects that can have access to a vari
 # Why Use Data-Transport ?
-Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write
-1. Familiarity with **pandas data-frames**
-2. Connectivity **drivers** are included
-3. Reading/Writing data from various sources
-4. Useful for data migrations or **ETL**
+Data transport is a simple framework that:
+- is easy to install & modify (open-source)
+- enables access to multiple database technologies (pandas, SQLAlchemy)
+- enables notebook sharing without exposing database credentials
+- supports pre/post processing specifications (pipelines)
 ## Installation
@@ -18,19 +17,16 @@ Within the virtual environment perform the following :
 pip install git+https://github.com/lnyemba/data-transport.git
-## Features
-- read/write from over a dozen databases
-- run ETL jobs seamlessly
-- scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
-## What's new
-Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin:
-1. Simpler syntax to create reader or writer
-2. auth-file registry that can be referenced using a label
-3. duckdb support
+Optional components can be installed by listing them in square brackets:
+pip install data-transport[nosql,cloud,warehouse,all]@git+https://github.com/lnyemba/data-transport.git
+## Additional features
+- In addition to read/write, there is support for pre/post processing functions
+- CLI interface to add entries to the registry and run ETL jobs
+- scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
 ## Learn More
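
The new README bullets promise data-frame read/write through registry labels, but the diff never shows a call. Below is a minimal usage sketch assuming the 2.x-style `transport.get.reader` / `transport.get.writer` factories and a `label` argument that points at an entry in the auth-file registry (both hinted at by the removed "What's new" items); treat the function names and the `pg-prod`/`warehouse` labels as illustrative assumptions rather than the verbatim API.

```python
import transport   # data-transport package

# Assumed 2.x-style factories; 'pg-prod' and 'warehouse' are hypothetical
# labels previously added to the auth-file registry (e.g. via the CLI).
reader = transport.get.reader(label='pg-prod')     # source connection
writer = transport.get.writer(label='warehouse')   # target connection

_df = reader.read()                    # pandas DataFrame, per the README
_df = _df[_df['status'] == 'active']   # arbitrary in-memory pre-processing step
writer.write(_df)                      # persist the frame to the target store
```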

@@ -19,25 +19,16 @@ classifiers = [
 dependencies = [
     "termcolor","sqlalchemy", "aiosqlite","duckdb-engine",
     "mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite",
-    "typer","pandas","numpy","sqlalchemy","pyarrow",
+    "typer","pandas","numpy","sqlalchemy","pyarrow","smart-open",
     "plugin-ix@git+https://github.com/lnyemba/plugins-ix"
 ]
 [project.optional-dependencies]
-sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
+#sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
 nosql = ["pymongo","cloudant"]
-cloud = ["pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"]
+cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy"]
 warehouse = ["pydrill","pyspark","sqlalchemy_drill"]
-rabbitmq = ["pika"]
-sqlite = ["aiosqlite"]
-aws3 = ["boto3","boto","botocore"]
-nextcloud = ["pyncclient"]
-mongodb = ["pymongo"]
-netezza = ["nzpy"]
-mysql = ["mysql-connector-python"]
-postgresql = ["psycopg2-binary"]
-sqlserver = ["pymssql"]
-http = ["flask-session"]
-all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"]
+other = ["pika","flask-session"]
+all = ["pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite"]
 [project.urls]
 Homepage = "https://healthcareio.the-phi.com/git/code/transport.git"

@@ -18,7 +18,26 @@ Source Code is available under MIT License:
 """
 import numpy as np
-from transport import sql, nosql, cloud, other, warehouse
+#from transport import sql, nosql, cloud, other, warehouse
+from transport import sql
+try:
+    from transport import nosql
+except Exception as e:
+    nosql = {}
+try:
+    from transport import cloud
+except Exception as e:
+    cloud = {}
+try:
+    from transport import warehouse
+except Exception as e:
+    warehouse = {}
+try:
+    from transport import other
+except Exception as e:
+    other = {}
 import pandas as pd
 import json
 import os
@@ -35,7 +54,7 @@ def init():
     global PROVIDERS
     for _module in [cloud,sql,nosql,other,warehouse] :
         for _provider_name in dir(_module) :
-            if _provider_name.startswith('__') or _provider_name == 'common':
+            if _provider_name.startswith('__') or _provider_name == 'common' or type(_module) in [None,str,dict]:
                 continue
             PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__}
     #
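
The patch above makes the provider families optional: each family import is wrapped in try/except and falls back to an empty dict, and init() then skips the placeholders via `type(_module) in [None,str,dict]` (note `type()` never returns `None`, so only the dict case can match). Below is a self-contained sketch of the same pattern that uses `inspect.ismodule` to make the intent explicit; the family names come from the diff, while the helper functions themselves are illustrative, not the project's code.

```python
import importlib
import inspect

FAMILIES = ['sql', 'nosql', 'cloud', 'warehouse', 'other']   # families named in the diff
PROVIDERS = {}

def load_families(package='transport'):
    """Import each provider family, falling back to a placeholder on failure."""
    loaded = {}
    for _name in FAMILIES:
        try:
            loaded[_name] = importlib.import_module(f'{package}.{_name}')
        except Exception:
            loaded[_name] = {}          # same empty placeholder the patch uses
    return loaded

def init(modules):
    """Register providers only from families that imported as real modules."""
    for _module in modules.values():
        if not inspect.ismodule(_module):   # replaces type(...) in [None, str, dict]
            continue
        for _provider_name in dir(_module):
            if _provider_name.startswith('__') or _provider_name == 'common':
                continue
            PROVIDERS[_provider_name] = {'module': getattr(_module, _provider_name),
                                         'type': _module.__name__}
```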
