pull/1/head
Steve Nyemba 5 months ago
parent e0e48a3d02
commit d154ac3cd0

@@ -4,12 +4,11 @@ This project implements an abstraction of objects that can have access to a vari
 # Why Use Data-Transport ?
-Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write
-1. Familiarity with **pandas data-frames**
-2. Connectivity **drivers** are included
-3. Reading/Writing data from various sources
-4. Useful for data migrations or **ETL**
+Data transport is a simple framework that:
+- is easy to install & modify (open-source)
+- enables access to multiple database technologies (pandas, SQLAlchemy)
+- enables notebook sharing without exposing database credentials
+- supports pre/post processing specifications (pipelines)
 ## Installation
@@ -18,19 +17,16 @@ Within the virtual environment perform the following :
 pip install git+https://github.com/lnyemba/data-transport.git
-## Features
-- read/write from over a dozen databases
-- run ETL jobs seamlessly
-- scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
-## What's new
-Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin:
-1. Simpler syntax to create reader or writer
-2. auth-file registry that can be referenced using a label
-3. duckdb support
+Optional components can be installed by listing them in square brackets:
+pip install data-transport[nosql,cloud,warehouse,all]@git+https://github.com/lnyemba/data-transport.git
+## Additional features
+- In addition to read/write, there is support for pre/post processing functions
+- CLI interface to add entries to the registry and run ETL jobs
+- scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
 ## Learn More
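
The new README bullets promise data-frame read/write through registry labels, but the diff never shows a call. Below is a minimal usage sketch assuming the 2.x-style `transport.get.reader` / `transport.get.writer` factories and a `label` argument that points at an entry in the auth-file registry (both hinted at by the removed "What's new" items); treat the function names and the `pg-prod`/`warehouse` labels as illustrative assumptions rather than the verbatim API.

```python
import transport   # data-transport package

# Assumed 2.x-style factories; 'pg-prod' and 'warehouse' are hypothetical
# labels previously added to the auth-file registry (e.g. via the CLI).
reader = transport.get.reader(label='pg-prod')     # source connection
writer = transport.get.writer(label='warehouse')   # target connection

_df = reader.read()                    # pandas DataFrame, per the README
_df = _df[_df['status'] == 'active']   # arbitrary in-memory pre-processing step
writer.write(_df)                      # persist the frame to the target store
```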

@@ -19,25 +19,16 @@ classifiers = [
 dependencies = [
     "termcolor","sqlalchemy", "aiosqlite","duckdb-engine",
     "mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite",
-    "typer","pandas","numpy","sqlalchemy","pyarrow",
+    "typer","pandas","numpy","sqlalchemy","pyarrow","smart-open",
     "plugin-ix@git+https://github.com/lnyemba/plugins-ix"
 ]
 [project.optional-dependencies]
-sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
+#sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"]
 nosql = ["pymongo","cloudant"]
-cloud = ["pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"]
+cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy"]
 warehouse = ["pydrill","pyspark","sqlalchemy_drill"]
-rabbitmq = ["pika"]
-sqlite = ["aiosqlite"]
-aws3 = ["boto3","boto","botocore"]
-nextcloud = ["pyncclient"]
-mongodb = ["pymongo"]
-netezza = ["nzpy"]
-mysql = ["mysql-connector-python"]
-postgresql = ["psycopg2-binary"]
-sqlserver = ["pymssql"]
-http = ["flask-session"]
-all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"]
+other = ["pika","flask-session"]
+all = ["pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite"]
 [project.urls]
 Homepage = "https://healthcareio.the-phi.com/git/code/transport.git"

@@ -18,7 +18,26 @@ Source Code is available under MIT License:
 """
 import numpy as np
-from transport import sql, nosql, cloud, other, warehouse
+#from transport import sql, nosql, cloud, other, warehouse
+from transport import sql
+try:
+    from transport import nosql
+except Exception as e:
+    nosql = {}
+try:
+    from transport import cloud
+except Exception as e:
+    cloud = {}
+try:
+    from transport import warehouse
+except Exception as e:
+    warehouse = {}
+try:
+    from transport import other
+except Exception as e:
+    other = {}
 import pandas as pd
 import json
 import os
@@ -35,7 +54,7 @@ def init():
     global PROVIDERS
     for _module in [cloud,sql,nosql,other,warehouse] :
         for _provider_name in dir(_module) :
-            if _provider_name.startswith('__') or _provider_name == 'common':
+            if _provider_name.startswith('__') or _provider_name == 'common' or type(_module) in [None,str,dict]:
                 continue
             PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__}
     #
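
The patch above makes the provider families optional: each family import is wrapped in try/except and falls back to an empty dict, and init() then skips the placeholders via `type(_module) in [None,str,dict]` (note `type()` never returns `None`, so only the dict case can match). Below is a self-contained sketch of the same pattern that uses `inspect.ismodule` to make the intent explicit; the family names come from the diff, while the helper functions themselves are illustrative, not the project's code.

```python
import importlib
import inspect

FAMILIES = ['sql', 'nosql', 'cloud', 'warehouse', 'other']   # families named in the diff
PROVIDERS = {}

def load_families(package='transport'):
    """Import each provider family, falling back to a placeholder on failure."""
    loaded = {}
    for _name in FAMILIES:
        try:
            loaded[_name] = importlib.import_module(f'{package}.{_name}')
        except Exception:
            loaded[_name] = {}          # same empty placeholder the patch uses
    return loaded

def init(modules):
    """Register providers only from families that imported as real modules."""
    for _module in modules.values():
        if not inspect.ismodule(_module):   # replaces type(...) in [None, str, dict]
            continue
        for _provider_name in dir(_module):
            if _provider_name.startswith('__') or _provider_name == 'common':
                continue
            PROVIDERS[_provider_name] = {'module': getattr(_module, _provider_name),
                                         'type': _module.__name__}
```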
