data-transport/transport/warehouse/iceberg.py

from pyspark.sql import SparkSession
import copy

class Iceberg :
    """
    Base class for accessing an Apache Iceberg warehouse through a pyspark session.

    Keyword arguments read by the constructor:
        table       optional default table name, used by meta/has when a call omits 'table'
        catalog     optional catalog, set as the session's current catalog
        database    optional database, set as the session's current database
    """
    def __init__(self,**_args):
        """
        providing catalog meta information (you must get this from apache iceberg)
        """
        #
        # @TODO:
        #   Make arrangements for additional configuration elements
        #
        self._session = SparkSession.builder.getOrCreate()
        self._catalog = self._session.catalog
        self._table = _args.get('table')
        # Always defined (None when not provided) so later reads never raise AttributeError
        self._database = _args.get('database')

        if 'catalog' in _args :
            #
            # Let us set the default catalog
            self._catalog.setCurrentCatalog(_args['catalog'])
        if 'database' in _args :
            self._catalog.setCurrentDatabase(self._database)
        #
        # Should we set the default database as the first one if available ?
        # For now we keep whatever the session reports as current.
        #
        self._catalogName = self._catalog.currentCatalog()
        self._databaseName = self._catalog.currentDatabase()

    def _getTable (self,**_args):
        """
        Return the table name for a call: the 'table' keyword argument when given,
        otherwise the default table provided at construction (may be None)
        """
        return _args['table'] if 'table' in _args else self._table

    def meta (self,**_args) :
        """
        This function should return the schema of a table (only)

        :table      table name (optional, defaults to the table given at construction)
        :catalog    catalog name (optional, defaults to the current catalog)
        :database   database name (optional, defaults to the current database)
        :return     list of field descriptions without the 'nullable' and 'metadata' entries,
                    or an empty list when no table is known or the lookup fails
        """
        _schema = []
        try:
            _table = self._getTable(**_args)
            if not _table :
                return _schema
            _tableName = self._getPrefix(**_args) + f".{_table}"
            _fields = self._session.table(_tableName).schema.jsonValue()['fields']
            for _item in _fields :
                # pop (instead of del) so a field lacking one of the keys doesn't abort the cleanup
                _item.pop('nullable',None)
                _item.pop('metadata',None)
            _schema = _fields
        except Exception as e:
            #
            # Best effort: report the failure (same convention as has) and return an empty schema
            print (e)
        return _schema

    def _getPrefix (self,**_args):
        """
        Build the '<catalog>.<database>' prefix, using the 'catalog' and 'database'
        keyword arguments when given and the names captured at construction otherwise
        """
        _catName = self._catalogName if 'catalog' not in _args else _args['catalog']
        _datName = self._databaseName if 'database' not in _args else _args['database']

        return '.'.join([_catName,_datName])

    def has (self,**_args):
        """
        Determine if a table is listed under the resolved '<catalog>.<database>' prefix

        :table      table name (optional, defaults to the table given at construction)
        :catalog    catalog name (optional)
        :database   database name (optional)
        :return     True if the table is listed; False otherwise, including when no table
                    is known, the database name is empty or the lookup fails
        """
        try:
            _prefix = self._getPrefix(**_args)
            _table = self._getTable(**_args)
            # A prefix ending with '.' means the database name is empty, nothing to look up
            if not _table or _prefix.endswith('.') :
                return False
            return any(_item.name == _table for _item in self._catalog.listTables(_prefix))
        except Exception as e:
            print (e)
            return False

    def apply(self,sql):
        """
        Placeholder: currently does nothing with the given sql and returns None
        """
        pass
class Reader(Iceberg) :
    """
    Reads from the warehouse and returns the result of toPandas() on the spark dataframe
    """
    def __init__(self,**_args):
        super().__init__(**_args)
    def read(self,**_args):
        """
        Read a whole table or the result of an SQL query.

        :table      table name (optional, defaults to the table given at construction)
        :catalog    catalog name (optional), used to qualify the table name
        :database   database name (optional), used to qualify the table name
        :sql        query to run, only read when no table is available
        :return     whatever toPandas() returns for the table or the query
        """
        _default = self._table
        _qualifier = self._getPrefix(**_args)
        #
        # Without any table (neither in the call nor at construction) we run the query as-is
        if 'table' not in _args and not _default :
            _query = _args['sql']
            _frame = self._session.sql(_query)
            return _frame.toPandas()
        #
        # A table passed to the call takes precedence over the one given at construction
        _name = _args.get('table',_default)
        _frame = self._session.table(f'{_qualifier}.{_name}')
        return _frame.toPandas()
class Writer (Iceberg):
    """
    Writing data to an Apache Iceberg data warehouse (using pyspark)
    """
    def __init__(self,**_args):
        super().__init__(**_args)
        _prefix = self._getPrefix(**_args)
adding warehouse support (iceberg) 1 month ago			`from pyspark.sql import SparkSession`
			`import copy`

			`class Iceberg :`
			`def __init__(self,**_args):`
			`"""`
			`providing catalog meta information (you must get this from apache iceberg)`
			`"""`
			`#`
			`# @TODO:`
			`# Make arrangements for additional configuration elements`
			`#`
			`self._session = SparkSession.builder.getOrCreate()`
			`self._catalog = self._session.catalog`
			`self._table = _args['table'] if 'table' in _args else None`

			`if 'catalog' in _args :`
			`#`
			`# Let us set the default catalog`
			`self._catalog.setCurrentCatalog(_args['catalog'])`

			`else:`
			`# No current catalog has been set ...`
			`pass`
			`if 'database' in _args :`
			`self._database = _args['database']`
			`self._catalog.setCurrentDatabase(self._database)`
			`else:`
			`#`
			`# Should we set the default as the first one if available ?`
			`#`
			`pass`
			`self._catalogName = self._catalog.currentCatalog()`
			`self._databaseName = self._catalog.currentDatabase()`
			`def meta (self,**_args) :`
			`"""`
			`This function should return the schema of a table (only)`
			`"""`
			`_schema = []`
			`try:`
			`_tableName = self._getPrefix(**_args) + f".{_args['table']}"`
			`print (_tableName)`
			`_tmp = self._session.table(_tableName).schema`
			`_schema = _tmp.jsonValue()['fields']`
			`for _item in _schema :`
			`del _item['nullable'],_item['metadata']`
			`except Exception as e:`

			`pass`
			`return _schema`
			`def _getPrefix (self,**_args):`
			`_catName = self._catalogName if 'catalog' not in _args else _args['catalog']`
			`_datName = self._databaseName if 'database' not in _args else _args['database']`

			`return '.'.join([_catName,_datName])`

			`def has (self,**_args):`
			`try:`
			`_prefix = self._getPrefix(**_args)`
			`if _prefix.endswith('.') :`
			`return False`
			`return _args['table'] in [_item.name for _item in self._catalog.listTables(_prefix)]`
			`except Exception as e:`
			`print (e)`
			`return False`
			`def apply(self,sql):`
			`pass`
			`class Reader(Iceberg) :`
			`def __init__(self,**_args):`
			`super().__init__(**_args)`
			`def read(self,**_args):`
			`_table = self._table`
			`_prefix = self._getPrefix(**_args)`
			`if 'table' in _args or _table:`
			`_table = _args['table'] if 'table' in _args else _table`
			`_table = _prefix + f'.{_table}'`
			`return self._session.table(_table).toPandas()`
			`else:`
			`sql = _args['sql']`
			`return self._session.sql(sql).toPandas()`
			`pass`
			`class Writer (Iceberg):`
			`"""`
			`Writing data to an Apache Iceberg data warehouse (using pyspark)`
			`"""`
			`def __init__(self,**_args):`
			`super().__init__(**_args)`
bug fix: write when table doesn't exist 1 month ago			`_prefix = self._getPrefix(**_args)`