From a1cf78a8897809e0596d976600f436921b7d3bbc Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Fri, 18 Oct 2024 11:41:52 -0500
Subject: [PATCH] bug fixes: drill inheritance and metadata function

---
 setup.py                        |  2 +-
 transport/duck.py               | 19 -------------------
 transport/providers/__init__.py |  3 ++-
 transport/sql/common.py         |  4 +++-
 transport/warehouse/__init__.py |  2 +-
 transport/warehouse/iceberg.py  | 12 ++++++++++++
 6 files changed, 19 insertions(+), 23 deletions(-)
 delete mode 100644 transport/duck.py

diff --git a/setup.py b/setup.py
index 7bb44e8..dcb52a0 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ args = {

     "packages": find_packages(include=['info','transport', 'transport.*'])}
 args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite']
-args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql']
+args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark']
 args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git"
 args['scripts'] = ['bin/transport']
 # if sys.version_info[0] == 2 :
diff --git a/transport/duck.py b/transport/duck.py
deleted file mode 100644
index 7d580c9..0000000
--- a/transport/duck.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""
-This file will be intended to handle duckdb database
-"""
-
-import duckdb
-from transport.common import Reader,Writer
-
-class Duck(Reader):
-    def __init__(self,**_args):
-        super().__init__(**_args)
-        self._path = None if 'path' not in _args else _args['path']
-        self._handler = duckdb.connect() if not self._path else duckdb.connect(self._path)
-
-
-class DuckReader(Duck) :
-    def __init__(self,**_args):
-        super().__init__(**_args)
-    def read(self,**_args) :
-        pass
\ No newline at end of file
diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py
index 35779d1..556df39 100644
--- a/transport/providers/__init__.py
+++ b/transport/providers/__init__.py
@@ -46,6 +46,7 @@ AWS_S3 = 's3'
 RABBIT = RABBITMQ

 ICEBERG='iceberg'
 APACHE_ICEBERG = 'iceberg'
-
+DRILL = 'drill'
+APACHE_DRILL = 'drill'
 # QLISTENER = 'qlistener'
\ No newline at end of file
diff --git a/transport/sql/common.py b/transport/sql/common.py
index 0a55ed7..cea285e 100644
--- a/transport/sql/common.py
+++ b/transport/sql/common.py
@@ -9,8 +9,9 @@ import pandas as pd

 class Base:
     def __init__(self,**_args):
+        # print ([' ## ',_args])
         self._host = _args['host'] if 'host' in _args else 'localhost'
-        self._port = None
+        self._port = None if 'port' not in _args else _args['port']
         self._database = _args['database']
         self._table = _args['table'] if 'table' in _args else None
         self._engine= sqa.create_engine(self._get_uri(**_args),future=True)
@@ -105,6 +106,7 @@
         if 'sql' in _args :
             sql = _args['sql']
         else:
+            # print (dir (self))
             _table = _args['table'] if 'table' in _args else self._table
             sql = f'SELECT * FROM {_table}'
         return self.apply(sql)
diff --git a/transport/warehouse/__init__.py b/transport/warehouse/__init__.py
index d82324e..bcd76fd 100644
--- a/transport/warehouse/__init__.py
+++ b/transport/warehouse/__init__.py
@@ -4,4 +4,4 @@ This namespace/package is intended to handle read/writes against data warehouse
  - clickhouse (...)
 """

-from . import iceberg
\ No newline at end of file
+from . import iceberg, drill
\ No newline at end of file
diff --git a/transport/warehouse/iceberg.py b/transport/warehouse/iceberg.py
index 4f6eca0..a22c16e 100644
--- a/transport/warehouse/iceberg.py
+++ b/transport/warehouse/iceberg.py
@@ -1,4 +1,10 @@
+"""
+dependency:
+  - spark and SPARK_HOME environment variable must be set
+"""
 from pyspark.sql import SparkSession
+from pyspark import SparkContext
+
 import copy

 class Iceberg :
@@ -7,10 +13,16 @@
         providing catalog meta information (you must get this from apache iceberg)
         """
         #
+        # Turning off logging (it's annoying & un-professional)
+        #
+        # _spconf = SparkContext()
+        # _spconf.setLogLevel("ERROR")
+        #
         # @TODO:
         # Make arrangements for additional configuration elements
         #
         self._session = SparkSession.builder.getOrCreate()
+        # self._session.sparkContext.setLogLevel("ERROR")
         self._catalog = self._session.catalog
         self._table = _args['table'] if 'table' in _args else None
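
Note on the transport/warehouse/iceberg.py change: both log-silencing attempts ship commented out. Below is a minimal sketch of the variant that relies only on documented PySpark calls (SparkSession.builder.getOrCreate() and SparkContext.setLogLevel()); it illustrates the approach, it is not code from this patch:

    from pyspark.sql import SparkSession

    class Iceberg:
        def __init__(self, **_args):
            # Reuse (or lazily create) the one session; its sparkContext is
            # the already-running context, so no second SparkContext() is
            # constructed.
            self._session = SparkSession.builder.getOrCreate()
            # Silence JVM-side INFO/WARN chatter; errors still surface.
            self._session.sparkContext.setLogLevel("ERROR")
            self._catalog = self._session.catalog
            self._table = _args['table'] if 'table' in _args else None

The _spconf = SparkContext() variant, by contrast, competes with the session builder for the single JVM context and fails with "Cannot run multiple SparkContexts at once" when a session already exists, which is presumably why it stayed commented out.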
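The one-line port fix in transport/sql/common.py is what makes the new Drill provider usable: Base.__init__ previously hard-coded self._port = None, so a caller-supplied port never reached the engine URI that subclasses build. A hypothetical usage sketch follows; the transport.factory.instance entry point and its keyword names are assumptions about the surrounding library rather than something this diff shows, and 8047 is simply Drill's default REST port:

    import transport
    from transport import providers

    # Hypothetical call (factory API assumed): with this patch, 'port' is
    # kept instead of being overwritten with None in Base.__init__.
    reader = transport.factory.instance(
        provider=providers.DRILL,   # constant introduced by this patch
        host='localhost',
        port=8047,                  # now honored when building the URI
        database='dfs.tmp',
        table='sample_table',       # hypothetical table name
    )
    df = reader.read()              # BaseReader.read issues SELECT * FROM sample_table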