You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
data-transport/bin/transport

120 lines
3.6 KiB
Python

#!/usr/bin/env python
__doc__ = """
(c) 2018 - 2021 data-transport
steve@the-phi.com, The Phi Technology LLC
https://dev.the-phi.com/git/steve/data-transport.git
This program performs ETL between 9 supported data sources : Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza,Redshift, Sqlite, File
Usage :
transport --config <path-to-file.json> --procs <number-procs>
@TODO: Create tables if they don't exist for relational databases
"""
import pandas as pd
import numpy as np
import json
import sys
import transport
import time
from multiprocessing import Process
SYS_ARGS = {}
if len(sys.argv) > 1:
N = len(sys.argv)
for i in range(1,N):
value = None
if sys.argv[i].startswith('--'):
key = sys.argv[i][2:] #.replace('-','')
SYS_ARGS[key] = 1
if i + 1 < N:
value = sys.argv[i + 1] = sys.argv[i+1].strip()
if key and value and not value.startswith('--'):
SYS_ARGS[key] = value
i += 2
class Post(Process):
def __init__(self,**args):
super().__init__()
self.PROVIDER = args['target']['type']
self.writer = transport.factory.instance(**args['target'])
self.rows = args['rows']
def run(self):
_info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows
self.writer.write(_info)
self.writer.close()
class ETL (Process):
def __init__(self,**_args):
super().__init__()
self.name = _args['id']
self.reader = transport.factory.instance(**_args['source'])
self._oargs = _args['target'] #transport.factory.instance(**_args['target'])
self.JOB_COUNT = _args['jobs']
self.jobs = []
# self.logger = transport.factory.instance(**_args['logger'])
def log(self,**_args) :
_args['name'] = self.name
print (_args)
def run(self):
idf = self.reader.read()
idf = pd.DataFrame(idf)
idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
#
# writing the data to a designated data source
#
try:
self.log(module='write',action='partitioning')
rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT)
for i in rows :
_id = 'segment #'.join([str(rows.index(i)),self.name])
segment = idf.loc[i,:] #.to_dict(orient='records')
proc = Post(target = self._oargs,rows = segment,name=_id)
self.jobs.append(proc)
proc.start()
self.log(module='write',action='working ...',name=self.name)
except Exception as e:
print (e)
def is_done(self):
self.jobs = [proc for proc in self.jobs if proc.is_alive()]
return len(self.jobs) == 0
def apply(_args) :
"""
This function will apply a set of commands against a data-store. The expected structure is as follows :
{"store":...,"apply":[]}
"""
handler = transport.factory.instance(**_args['store'])
for cmd in _args['apply'] :
handler.apply(cmd)
handler.close()
if __name__ == '__main__' :
_info = json.loads(open (SYS_ARGS['config']).read())
index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None
procs = []
for _config in _info :
if 'source' in SYS_ARGS :
_config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}}
_config['jobs'] = 10 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs'])
etl = ETL (**_config)
etl.start()
procs.append(etl)
if index and _info.index(_config) == index :
break
#
#
N = len(procs)
while procs :
procs = [thread for thread in procs if not thread.is_done()]
if len(procs) < N :
print (["Finished ",(N-len(procs)), " remaining ", len(procs)])
N = len(procs)
time.sleep(1)
print ("We're done !!")