diff --git a/README.md b/README.md index 7d8b414..577350e 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,11 @@ This project implements an abstraction of objects that can have access to a vari # Why Use Data-Transport ? -Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write - -1. Familiarity with **pandas data-frames** -2. Connectivity **drivers** are included -3. Reading/Writing data from various sources -4. Useful for data migrations or **ETL** +Data transport is a simple framework that: +- is easy to install & modify (open-source) +- enables access to multiple database technologies (pandas, SQLAlchemy) +- enables notebook sharing without exposing database credentials +- supports pre/post processing specifications (pipeline) ## Installation @@ -18,19 +17,16 @@ Within the virtual environment perform the following : pip install git+https://github.com/lnyemba/data-transport.git -## Features +Optional components can be installed by listing them in square brackets: - - read/write from over a dozen databases - - run ETL jobs seamlessly - - scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ... + pip install data-transport[nosql,cloud,warehouse,all]@git+https://github.com/lnyemba/data-transport.git -## What's new -Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin: +## Additional features - 1. Simpler syntax to create reader or writer - 2. auth-file registry that can be referenced using a label - 3. 
duckdb support + - In addition to read/write, there is support for functions for pre/post processing + - CLI interface to add to registry, run ETL + - scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ... ## Learn More diff --git a/pyproject.toml b/pyproject.toml index 159e9cb..742915d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,25 +19,16 @@ classifiers = [ dependencies = [ "termcolor","sqlalchemy", "aiosqlite","duckdb-engine", "mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite", - "typer","pandas","numpy","sqlalchemy","pyarrow", + "typer","pandas","numpy","sqlalchemy","pyarrow","smart-open", "plugin-ix@git+https://github.com/lnyemba/plugins-ix" ] [project.optional-dependencies] -sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] +#sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] nosql = ["pymongo","cloudant"] -cloud = ["pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] +cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] warehouse = ["pydrill","pyspark","sqlalchemy_drill"] -rabbitmq = ["pika"] -sqlite = ["aiosqlite"] -aws3 = ["boto3","boto","botocore"] -nextcloud = ["pyncclient"] -mongodb = ["pymongo"] -netezza = ["nzpy"] -mysql = ["mysql-connector-python"] -postgresql = ["psycopg2-binary"] -sqlserver = ["pymssql"] -http = ["flask-session"] -all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", 
"pyncclient"] +other = ["pika","flask-session"] +all = ["pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"] [project.urls] Homepage = "https://healthcareio.the-phi.com/git/code/transport.git" diff --git a/transport/__init__.py b/transport/__init__.py index c3bb901..8ace570 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,7 +18,26 @@ Source Code is available under MIT License: """ import numpy as np -from transport import sql, nosql, cloud, other, warehouse +#from transport import sql, nosql, cloud, other, warehouse +from transport import sql +try: + from transport import nosql +except Exception as e: + nosql = {} +try: + from transport import cloud +except Exception as e: + cloud = {} +try: + from transport import warehouse +except Exception as e: + warehouse= {} +try: + from transport import other +except Exception as e: + other = {} + + import pandas as pd import json import os @@ -35,7 +54,7 @@ def init(): global PROVIDERS for _module in [cloud,sql,nosql,other,warehouse] : for _provider_name in dir(_module) : - if _provider_name.startswith('__') or _provider_name == 'common': + if _provider_name.startswith('__') or _provider_name == 'common' or type(_module) in [None,str,dict]: continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} #