From e9aab3b034c214d2a7d5c8187cc722a110868cd4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 13 Sep 2024 10:15:47 -0500 Subject: [PATCH] bug fix, duckdb in-memory handling --- info/__init__.py | 2 +- notebooks/plugins.ipynb | 149 ++++++++++++++++++++++++++++++++++++++++ transport/duck.py | 19 +++++ 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 notebooks/plugins.ipynb create mode 100644 transport/duck.py diff --git a/info/__init__.py b/info/__init__.py index 04adfdf..a97b1ab 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.2' +__version__= '2.2.4' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. Nyemba diff --git a/notebooks/plugins.ipynb b/notebooks/plugins.ipynb new file mode 100644 index 0000000..a5f7abb --- /dev/null +++ b/notebooks/plugins.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing data-transport plugins\n", + "\n", + "The data-transport plugins are designed to automate pre/post processing i.e\n", + "\n", + " - Read -> Post processing\n", + " - Write-> Pre processing\n", + " \n", + "In this example we will assume, data and write both pre/post processing to any supported infrastructure. We will equally show how to specify the plugins within a configuration file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "import shutil\n", + "#\n", + "#\n", + "\n", + "DATABASE = '/home/steve/tmp/demo.db3'\n", + "if os.path.exists(DATABASE) :\n", + " os.remove(DATABASE)\n", + "#\n", + "# \n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "litew = transport.get.writer(provider=providers.SQLITE,database=DATABASE)\n", + "litew.write(_data,table='friends')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from SQLite\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age from a plugin function we will write. \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Read with pipeline functions defined in code\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** or **transport.get.<[reader|writer]>** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "\n", + "\n", + " name age autoinc\n", + "0 James Bond 5.5 0\n", + "1 Steve Rogers 15.0 1\n", + "2 Steve Nyemba 4.4 2\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "import numpy as np\n", + "def _autoincrement (_data,**kwargs) :\n", + " \"\"\"\n", + " This function will add an autoincrement field to the table\n", + " \"\"\"\n", + " _data['autoinc'] = np.arange(_data.shape[0])\n", + " \n", + " return _data\n", + "def reduce(_data,**_args) :\n", + " \"\"\"\n", + " This function will reduce the age of the data frame\n", + " \"\"\"\n", + " _data.age /= 10\n", + " return _data\n", + "reader = transport.get.reader(provider=providers.SQLITE,database=DATABASE,table='friends')\n", + "#\n", + "# basic read of the data created in the first cell\n", + "_df = reader.read()\n", + "print (_df)\n", + "print ()\n", + "print()\n", + "#\n", + "# read of the data with pipeline function provided to alter the database\n", + "print (reader.read(pipeline=[_autoincrement,reduce]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The parameters for instianciating a transport object (reader or writer) can be found at [data-transport home](https://healthcareio.the-phi.com/data-transport)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/transport/duck.py b/transport/duck.py new file mode 100644 index 0000000..7d580c9 --- /dev/null +++ b/transport/duck.py @@ -0,0 +1,19 @@ +""" +This file will be intended to handle duckdb database +""" + +import duckdb +from transport.common import Reader,Writer + +class Duck(Reader): + def __init__(self,**_args): + super().__init__(**_args) + self._path = None if 'path' not in _args else _args['path'] + self._handler = duckdb.connect() if not self._path else duckdb.connect(self._path) + + +class DuckReader(Duck) : + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args) : + pass \ No newline at end of file