From ad3bc9046640907393eae8122ab5c0ed1ac48014 Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Mon, 7 Aug 2017 15:06:12 +0000 Subject: [PATCH 001/271] Initial commit --- README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ffe1ff --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# data-transport + +reusable data transport object \ No newline at end of file From b1f2d28795635bbfc67e9bab4ef4b8f86a67241a Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Mon, 7 Aug 2017 15:21:00 +0000 Subject: [PATCH 002/271] Update 'README.md' initialization --- README.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4ffe1ff..242d21a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,24 @@ -# data-transport +# Introduction -reusable data transport object \ No newline at end of file +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write functions associated and specific to the data-sources. The classes implement functionalities against : + + - Rabbitmq-server + - Couchdb-server + - Http Session : {csv,tab,pipe,sql} + - Disk{Reader|Writer} : csv, tab, pipe, sql on disk + + +### Usage + +The basic usage revolves around a factory class (to be a singleton) + + import transport + + p = {"uri":"https://your-server:5984","dbname":"mydatabase","doc":"doc_id"} + couchdb = transport.Factory.instance(type='CouchdbReader',args=p) + + # + # let's execute a view + # + result = couchdb.view('view_name/function',key=value) + info = couchdb.read() \ No newline at end of file From 0e2d85e60882016a588b0b1d89eb6d58ebc326b7 Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Mon, 25 Sep 2017 16:30:19 +0000 Subject: [PATCH 003/271] Upload files to '' transport file --- transport.py | 686 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 686 insertions(+) create mode 100644 transport.py diff --git a/transport.py b/transport.py new file mode 100644 index 0000000..8ed1370 --- /dev/null +++ b/transport.py @@ -0,0 +1,686 @@ +""" + This file implements data transport stuctures in order to allow data to be moved to and from anywhere + We can thus read data from disk and write to the cloud,queue, or couchdb or SQL +""" +from flask import request, session +import os +import pika +import json +import numpy as np +from couchdbkit import Server +import re +from csv import reader +from datetime import datetime +""" + @TODO: Write a process by which the class automatically handles reading and creating a preliminary sample and discovers the meta data +""" +class Reader: + def __init__(self): + self.nrows = 0 + self.xchar = None + + def row_count(self): + content = self.read() + return np.sum([1 for row in content]) + """ + This function determines the most common delimiter from a subset of possible delimiters. 
It uses a statistical approach to guage the distribution of columns for a given delimiter + """ + def delimiter(self,sample): + + m = {',':[],'\t':[],'|':[],'\x3A':[]} + delim = m.keys() + for row in sample: + for xchar in delim: + if row.split(xchar) > 1: + m[xchar].append(len(row.split(xchar))) + else: + m[xchar].append(0) + + + + # + # The delimiter with the smallest variance, provided the mean is greater than 1 + # This would be troublesome if there many broken records sampled + # + m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} + index = m.values().index( min(m.values())) + xchar = m.keys()[index] + + return xchar + """ + This function determines the number of columns of a given sample + @pre self.xchar is not None + """ + def col_count(self,sample): + + m = {} + i = 0 + + for row in sample: + row = self.format(row) + id = str(len(row)) + #id = str(len(row.split(self.xchar))) + + if id not in m: + m[id] = 0 + m[id] = m[id] + 1 + + index = m.values().index( max(m.values()) ) + ncols = int(m.keys()[index]) + + + return ncols; + """ + This function will clean records of a given row by removing non-ascii characters + @pre self.xchar is not None + """ + def format (self,row): + + if isinstance(row,list) == False: + # + # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) + cols = self.split(row) + #cols = row.split(self.xchar) + else: + cols = row ; + return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] + + #if isinstance(row,list) == False: + # return (self.xchar.join(r)).format('utf-8') + #else: + # return r + """ + This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. + @pre : self.xchar is not None + """ + def split (self,row): + + pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) + return re.findall(pattern,row.replace('\n','')) + +class Writer: + + def format(self,row,xchar): + if xchar is not None and isinstance(row,list): + return xchar.join(row)+'\n' + elif xchar is None and isinstance(row,dict): + row = json.dumps(row) + return row + """ + It is important to be able to archive data so as to insure that growth is controlled + Nothing in nature grows indefinitely neither should data being handled. + """ + def archive(self): + pass + def flush(self): + pass + +""" + This class is designed to read data from an Http request file handler provided to us by flask + The file will be heald in memory and processed accordingly + NOTE: This is inefficient and can crash a micro-instance (becareful) +""" +class HttpRequestReader(Reader): + def __init__(self,**params): + self.file_length = 0 + try: + + #self.file = params['file'] + #self.file.seek(0, os.SEEK_END) + #self.file_length = self.file.tell() + + #print 'size of file ',self.file_length + self.content = params['file'].readlines() + self.file_length = len(self.content) + except Exception, e: + print "Error ... 
",e + pass + + def isready(self): + return self.file_length > 0 + def read(self,size =-1): + i = 1 + for row in self.content: + i += 1 + if size == i: + break + yield row + +""" + This class is designed to write data to a session/cookie +""" +class HttpSessionWriter(Writer): + """ + @param key required session key + """ + def __init__(self,**params): + self.session = params['queue'] + self.session['sql'] = [] + self.session['csv'] = [] + self.tablename = re.sub('..+$','',params['filename']) + self.session['uid'] = params['uid'] + #self.xchar = params['xchar'] + + + def format_sql(self,row): + values = "','".join([col.replace('"','').replace("'",'') for col in row]) + return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) + def isready(self): + return True + def write(self,**params): + label = params['label'] + row = params ['row'] + + if label == 'usable': + self.session['csv'].append(self.format(row,',')) + self.session['sql'].append(self.format_sql(row)) + +""" + This class is designed to read data from disk (location on hard drive) + @pre : isready() == True +""" +class DiskReader(Reader) : + """ + @param path absolute path of the file to be read + """ + def __init__(self,**params): + Reader.__init__(self) + self.path = params['path'] ; + + def isready(self): + return os.path.exists(self.path) + """ + This function reads the rows from a designated location on disk + @param size number of rows to be read, -1 suggests all rows + """ + def read(self,size=-1): + f = open(self.path,'rU') + i = 1 + for row in f: + + i += 1 + if size == i: + break + yield row + f.close() +""" + This function writes output to disk in a designated location +""" +class DiskWriter(Writer): + def __init__(self,**params): + if 'path' in params: + self.path = params['path'] + else: + self.path = None + if 'name' in params: + self.name = params['name']; + else: + self.name = None + if os.path.exists(self.path) == False: + os.mkdir(self.path) + """ + This function determines if the class is ready for execution or not + i.e it determines if the preconditions of met prior execution + """ + def isready(self): + + p = self.path is not None and os.path.exists(self.path) + q = self.name is not None + return p and q + """ + This function writes a record to a designated file + @param label + @param row row to be written + """ + def write(self,**params): + label = params['label'] + row = params['row'] + xchar = None + if 'xchar' is not None: + xchar = params['xchar'] + path = ''.join([self.path,os.sep,label]) + if os.path.exists(path) == False: + os.mkdir(path) ; + path = ''.join([path,os.sep,self.name]) + f = open(path,'a') + row = self.format(row,xchar); + f.write(row) + f.close() +""" + This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) +""" +class MessageQueue: + def __init__(self,**params): + self.host= params['host'] + self.uid = params['uid'] + self.qid = params['qid'] + + def isready(self): + #self.init() + resp = self.connection is not None and self.connection.is_open + self.close() + return resp + def close(self): + if self.connection.is_closed == False : + self.channel.close() + self.connection.close() +""" + This class is designed to publish content to an AMQP (Rabbitmq) + The class will rely on pika to implement this functionality + + We will publish information to a given queue for a given exchange +""" + +class QueueWriter(MessageQueue,Writer): + def __init__(self,**params): + #self.host= 
params['host'] + #self.uid = params['uid'] + #self.qid = params['queue'] + MessageQueue.__init__(self,**params); + + + def init(self,label=None): + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.info = self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) + if label is None: + self.qhandler = self.channel.queue_declare(queue=self.qid,durable=True) + else: + self.qhandler = self.channel.queue_declare(queue=label,durable=True) + + self.channel.queue_bind(exchange=self.uid,queue=self.qhandler.method.queue) + + + + """ + This function writes a stream of data to the a given queue + @param object object to be written (will be converted to JSON) + @TODO: make this less chatty + """ + def write(self,**params): + xchar = None + if 'xchar' in params: + xchar = params['xchar'] + object = self.format(params['row'],xchar) + + label = params['label'] + self.init(label) + _mode = 2 + if isinstance(object,str): + stream = object + _type = 'text/plain' + else: + stream = json.dumps(object) + if 'type' in params : + _type = params['type'] + else: + _type = 'application/json' + + self.channel.basic_publish( + exchange=self.uid, + routing_key=label, + body=stream, + properties=pika.BasicProperties(content_type=_type,delivery_mode=_mode) + ); + self.close() + + def flush(self,label): + self.init(label) + _mode = 1 #-- Non persistent + self.channel.queue_delete( queue=label); + self.close() + +""" + This class will read from a queue provided an exchange, queue and host + @TODO: Account for security and virtualhosts +""" +class QueueReader(MessageQueue,Reader): + """ + @param host host + @param uid exchange identifier + @param qid queue identifier + """ + def __init__(self,**params): + #self.host= params['host'] + #self.uid = params['uid'] + #self.qid = params['qid'] + MessageQueue.__init__(self,**params); + if 'durable' in params : + self.durable = True + else: + self.durable = False + self.size = -1 + self.data = {} + def init(self,qid): + + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) + + self.info = self.channel.queue_declare(queue=qid,durable=True) + + + + """ + This is the callback function designed to process the data stream from the queue + + """ + def callback(self,channel,method,header,stream): + + r = [] + if re.match("^\{|\[",stream) is not None: + r = json.loads(stream) + else: + + r = stream + + qid = self.info.method.queue + if qid not in self.data : + self.data[qid] = [] + + self.data[qid].append(r) + # + # We stop reading when the all the messages of the queue are staked + # + if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: + self.close() + + """ + This function will read, the first message from a queue + @TODO: + Implement channel.basic_get in order to retrieve a single message at a time + Have the number of messages retrieved be specified by size (parameter) + """ + def read(self,size=-1): + r = {} + self.size = size + # + # We enabled the reader to be able to read from several queues (sequentially for now) + # The qid parameter will be an array of queues the reader will be reading from + # + if isinstance(self.qid,basestring) : + self.qid = [self.qid] + for qid in self.qid: + self.init(qid) + # r[qid] = [] + + if 
self.info.method.message_count > 0: + + self.channel.basic_consume(self.callback,queue=qid,no_ack=False); + self.channel.start_consuming() + else: + + pass + #self.close() + # r[qid].append( self.data) + + return self.data +class QueueListener(QueueReader): + def init(self,qid): + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True ) + + self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) + + self.channel.queue_bind(exchange=self.uid,queue=self.info.method.queue,routing_key=qid) + #self.callback = callback + def read(self): + + self.init(self.qid) + self.channel.basic_consume(self.callback,queue=self.qid,no_ack=True); + self.channel.start_consuming() + +""" + This class is designed to write output as sql insert statements + The class will inherit from DiskWriter with minor adjustments + @TODO: Include script to create the table if need be using the upper bound of a learner +""" +class SQLDiskWriter(DiskWriter): + def __init__(self,**args): + DiskWriter.__init__(self,**args) + self.tablename = re.sub('\..+$','',self.name).replace(' ','_') + """ + @param label + @param row + @param xchar + """ + def write(self,**args): + label = args['label'] + row = args['row'] + + if label == 'usable': + values = "','".join([col.replace('"','').replace("'",'') for col in row]) + row = "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) + + args['row'] = row + DiskWriter.write(self,**args) +class Couchdb: + """ + @param uri host & port reference + @param uid user id involved + + @param dbname database name (target) + """ + def __init__(self,**args): + uri = args['uri'] + self.uid = args['uid'] + dbname = args['dbname'] + self.server = Server(uri=uri) + self.dbase = self.server.get_db(dbname) + if self.dbase.doc_exist(self.uid) == False: + self.dbase.save_doc({"_id":self.uid}) + """ + Insuring the preconditions are met for processing + """ + def isready(self): + p = self.server.info() != {} + if p == False or self.dbase.dbname not in self.server.all_dbs(): + return False + # + # At this point we are sure that the server is connected + # We are also sure that the database actually exists + # + q = self.dbase.doc_exist(self.uid) + if q == False: + return False + return True + def view(self,id,**args): + r =self.dbase.view(id,**args) + r = r.all() + return r[0]['value'] if len(r) > 0 else [] + +""" + This function will read an attachment from couchdb and return it to calling code. The attachment must have been placed before hand (otherwise oops) + @T: Account for security & access control +""" +class CouchdbReader(Couchdb,Reader): + """ + @param filename filename (attachment) + """ + def __init__(self,**args): + # + # setting the basic parameters for + Couchdb.__init__(self,**args) + if 'filename' in args : + self.filename = args['filename'] + else: + self.filename = None + + def isready(self): + # + # Is the basic information about the database valid + # + p = Couchdb.isready(self) + + if p == False: + return False + # + # The database name is set and correct at this point + # We insure the document of the given user has the requested attachment. 
+ # + + doc = self.dbase.get(self.uid) + + if '_attachments' in doc: + r = self.filename in doc['_attachments'].keys() + + else: + r = False + + return r + def stream(self): + content = self.dbase.fetch_attachment(self.uid,self.filename).split('\n') ; + i = 1 + for row in content: + yield row + if size > 0 and i == size: + break + i = i + 1 + + def read(self,size=-1): + if self.filename is not None: + self.stream() + else: + return self.basic_read() + def basic_read(self): + document = self.dbase.get(self.uid) + del document['_id'], document['_rev'] + return document +""" + This class will write on a couchdb document provided a scope + The scope is the attribute that will be on the couchdb document +""" +class CouchdbWriter(Couchdb,Writer): + """ + @param uri host & port reference + @param uid user id involved + @param filename filename (attachment) + @param dbname database name (target) + """ + def __init__(self,**args): + + Couchdb.__init__(self,**args) + uri = args['uri'] + self.uid = args['uid'] + if 'filename' in args: + self.filename = args['filename'] + else: + self.filename = None + dbname = args['dbname'] + self.server = Server(uri=uri) + self.dbase = self.server.get_db(dbname) + # + # If the document doesn't exist then we should create it + # + + """ + write a given attribute to a document database + @param label scope of the row repair|broken|fixed|stats + @param row row to be written + """ + def write(self,**params): + + document = self.dbase.get(self.uid) + label = params['label'] + row = params['row'] + if label not in document : + document[label] = [] + document[label].append(row) + self.dbase.save_doc(document) + def flush(self,**params) : + + size = params['size'] if 'size' in params else 0 + has_changed = False + document = self.dbase.get(self.uid) + for key in document: + if key not in ['_id','_rev','_attachments'] : + content = document[key] + else: + continue + if isinstance(content,list) and size > 0: + index = len(content) - size + content = content[index:] + document[key] = content + + else: + document[key] = {} + has_changed = True + + self.dbase.save_doc(document) + + def archive(self,params=None): + document = self.dbase.get(self.uid) + content = {} + _doc = {} + for id in document: + if id in ['_id','_rev','_attachments'] : + _doc[id] = document[id] + else: + content[id] = document[id] + + content = json.dumps(content) + document= _doc + now = str(datetime.today()) + + name = '-'.join([document['_id'] , now,'.json']) + self.dbase.save_doc(document) + self.dbase.put_attachment(document,content,name,'application/json') +""" + This class acts as a factory to be able to generate an instance of a Reader/Writer + Against a Queue,Disk,Cloud,Couchdb + The class doesn't enforce parameter validation, thus any error with the parameters sent will result in a null Object +""" +class DataSourceFactory: + def instance(self,**args): + source = args['type'] + params = args['args'] + anObject = None + + if source in ['HttpRequestReader','HttpSessionWriter']: + # + # @TODO: Make sure objects are serializable, be smart about them !! + # + aClassName = ''.join([source,'(**params)']) + + + else: + + stream = json.dumps(params) + aClassName = ''.join([source,'(**',stream,')']) + try: + + + anObject = eval( aClassName) + #setattr(anObject,'name',source) + except Exception,e: + print ['Error ',e] + return anObject +""" + This class implements a data-source handler that is intended to be used within the context of data processing, it allows to read/write anywhere transparently. 
+ The class is a facade to a heterogeneous class hierarchy and thus simplifies how the calling code interacts with the class hierarchy +""" +class DataSource: + def __init__(self,sourceType='Disk',outputType='Disk',params={}): + self.Input = DataSourceFactory.instance(type=sourceType,args=params) + self.Output= DataSourceFactory.instance(type=outputType,args=params) + def read(self,size=-1): + return self.Input.read(size) + def write(self,**args): + self.Output.write(**args) +#p = {} +#p['host'] = 'dev.the-phi.com' +#p['uid'] = 'nyemba@gmail.com' +#p['qid'] = 'repair' +#factory = DataSourceFactory() +#o = factory.instance(type='QueueReader',args=p) +#print o is None +#q = QueueWriter(host='dev.the-phi.com',uid='nyemba@gmail.com') +#q.write(object='steve') +#q.write(object='nyemba') +#q.write(object='elon') + + From 3e77175d06fc4630965f4206dd25aa98b281f4d3 Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Mon, 25 Sep 2017 21:55:34 -0500 Subject: [PATCH 004/271] TR - added s3 handling (reader) --- transport.py | 96 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 13 deletions(-) diff --git a/transport.py b/transport.py index 8ed1370..ea13bff 100644 --- a/transport.py +++ b/transport.py @@ -11,6 +11,9 @@ from couchdbkit import Server import re from csv import reader from datetime import datetime +import boto +import botocore +from smart_open import smart_open """ @TODO: Write a process by which the class automatically handles reading and creating a preliminary sample and discovers the meta data """ @@ -629,12 +632,81 @@ class CouchdbWriter(Couchdb,Writer): name = '-'.join([document['_id'] , now,'.json']) self.dbase.save_doc(document) self.dbase.put_attachment(document,content,name,'application/json') +class s3 : + """ + @TODO: Implement a search function for a file given a bucket?? + """ + def __init__(self,args) : + """ + This function will extract a file or set of files from s3 bucket provided + @param access_key + @param secret_key + @param path location of the file + @param filter filename or filtering elements + """ + try: + self.s3 = boto.connect_s3(args['access_key'],args['secret_key']) + self.bucket = self.s3.get_bucket(args['bucket']) if 'bucket' in args else None + # self.path = args['path'] + self.filter = args['filter'] if 'filter' in args else None + self.filename = args['file'] if 'file' in args else None + + except Exception as e : + self.s3 = None + self.bucket = None + print e + def buckets(self): + """ + This function is a wrapper around the bucket list of buckets for s3 + + """ + return self.s3.get_all_buckets() + + +class s3Reader(s3,Reader) : + """ + Because s3 contains buckets and files, reading becomes a tricky proposition : + - list files if file is None + - stream content if file is Not None + @TODO: support read from all buckets, think about it + """ + def __init__(self,args) : + s3.__init__(self,args) + def files(self): + r = [] + try: + return [item.name for item in self.bucket if item.size > 0] + except Exception as e: + pass + return r + def stream(self,limit=-1): + """ + At this point we should stream a file from a given bucket + """ + key = self.bucket.get_key(self.filename.strip()) + if key is None : + yield None + else: + count = 0 + with smart_open(key) as remote_file: + for line in remote_file: + if count == limit and limit > 0 : + break + yield line + count += 1 + def read(self,limit=-1) : + if self.filename is None : + # + # returning the list of files because no one file was specified. 
+ return self.files() + else: + return self.stream(10) """ This class acts as a factory to be able to generate an instance of a Reader/Writer Against a Queue,Disk,Cloud,Couchdb The class doesn't enforce parameter validation, thus any error with the parameters sent will result in a null Object """ -class DataSourceFactory: +class Factory: def instance(self,**args): source = args['type'] params = args['args'] @@ -659,6 +731,10 @@ class DataSourceFactory: except Exception,e: print ['Error ',e] return anObject +class s3Writer(s3,Writer) : + def __init__(self,args) : + s3.__init__(self,args) + """ This class implements a data-source handler that is intended to be used within the context of data processing, it allows to read/write anywhere transparently. The class is a facade to a heterogeneous class hierarchy and thus simplifies how the calling code interacts with the class hierarchy @@ -671,16 +747,10 @@ class DataSource: return self.Input.read(size) def write(self,**args): self.Output.write(**args) -#p = {} -#p['host'] = 'dev.the-phi.com' -#p['uid'] = 'nyemba@gmail.com' -#p['qid'] = 'repair' -#factory = DataSourceFactory() -#o = factory.instance(type='QueueReader',args=p) -#print o is None -#q = QueueWriter(host='dev.the-phi.com',uid='nyemba@gmail.com') -#q.write(object='steve') -#q.write(object='nyemba') -#q.write(object='elon') - +# conf = json.loads(open('conf.json').read()) +# x = s3Reader( dict(conf,**{'bucket':'com.phi.sample.data','file':'Sample-Spreadsheet-5000-rows.csv'})) +# r = x.read() +# for item in r : +# print item +#print buckets[1].get_key('Sample-Spreadsheet-5000-rows.csv') From cf6f017b91f6db995f124d7b8e1e9c0ccace23e9 Mon Sep 17 00:00:00 2001 From: "Steve L. Nyemba" Date: Tue, 26 Sep 2017 15:54:26 -0500 Subject: [PATCH 005/271] S3 - Bug fix? Not sure --- transport.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) mode change 100644 => 100755 transport.py diff --git a/transport.py b/transport.py old mode 100644 new mode 100755 index ea13bff..c6bd085 --- a/transport.py +++ b/transport.py @@ -646,7 +646,7 @@ class s3 : """ try: self.s3 = boto.connect_s3(args['access_key'],args['secret_key']) - self.bucket = self.s3.get_bucket(args['bucket']) if 'bucket' in args else None + self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None # self.path = args['path'] self.filter = args['filter'] if 'filter' in args else None self.filename = args['file'] if 'file' in args else None @@ -747,9 +747,11 @@ class DataSource: return self.Input.read(size) def write(self,**args): self.Output.write(**args) -# conf = json.loads(open('conf.json').read()) -# x = s3Reader( dict(conf,**{'bucket':'com.phi.sample.data','file':'Sample-Spreadsheet-5000-rows.csv'})) - +conf = json.loads(open('config.json').read()) +#x = s3Reader( dict(conf,**{'bucket':'com.phi.sample.data','file':'Sample-Spreadsheet-5000-rows.csv'})) +x = s3Reader(conf) +print conf +print x.bucket.get_all_keys() # r = x.read() # for item in r : # print item From 8d4ecd7a9f31fa72a08fbb68f28cdfee7bd8ced2 Mon Sep 17 00:00:00 2001 From: "Steve L. 
Nyemba" Date: Tue, 26 Sep 2017 16:10:14 -0500 Subject: [PATCH 006/271] S3 Requirments file --- requirements.txt | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4e72ea4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,35 @@ +asn1crypto==0.23.0 +boto==2.48.0 +boto3==1.4.7 +botocore==1.7.17 +bz2file==0.98 +certifi==2017.7.27.1 +cffi==1.11.0 +chardet==3.0.4 +click==6.7 +couchdbkit==0.6.5 +cryptography==2.0.3 +docutils==0.14 +enum34==1.1.6 +Flask==0.12.2 +futures==3.1.1 +http-parser==0.8.3 +idna==2.6 +ipaddress==1.0.18 +itsdangerous==0.24 +Jinja2==2.9.6 +jmespath==0.9.3 +MarkupSafe==1.0 +numpy==1.13.1 +pika==0.11.0 +pycparser==2.18 +pyOpenSSL==17.3.0 +python-dateutil==2.6.1 +requests==2.18.4 +restkit==4.2.2 +s3transfer==0.1.11 +six==1.11.0 +smart-open==1.5.3 +socketpool==0.5.3 +urllib3==1.22 +Werkzeug==0.12.2 From 0b15351b8efc9612e1897086469c090214c8effa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 Sep 2019 23:08:43 -0500 Subject: [PATCH 007/271] data transport framework for rabbitmq, mongodb, couchdb, ... --- setup.py | 19 ++++ transport/__init__.py | 209 +++++++++++++++++++++++++++++++++++++++++ transport/__init__.pyc | Bin 0 -> 2005 bytes transport/common.py | 154 ++++++++++++++++++++++++++++++ transport/common.pyc | Bin 0 -> 4344 bytes transport/couch.py | 199 +++++++++++++++++++++++++++++++++++++++ transport/couch.pyc | Bin 0 -> 5405 bytes transport/couchdb.pyc | Bin 0 -> 5447 bytes transport/disk.py | 82 ++++++++++++++++ transport/disk.pyc | Bin 0 -> 2855 bytes transport/mongo.py | 66 +++++++++++++ transport/mongo.pyc | Bin 0 -> 3162 bytes transport/queue.py | 200 +++++++++++++++++++++++++++++++++++++++ transport/queue.pyc | Bin 0 -> 6988 bytes transport/s3.py | 83 ++++++++++++++++ transport/s3.pyc | Bin 0 -> 3141 bytes transport/session.py | 66 +++++++++++++ 17 files changed, 1078 insertions(+) create mode 100644 setup.py create mode 100644 transport/__init__.py create mode 100644 transport/__init__.pyc create mode 100644 transport/common.py create mode 100644 transport/common.pyc create mode 100644 transport/couch.py create mode 100644 transport/couch.pyc create mode 100644 transport/couchdb.pyc create mode 100644 transport/disk.py create mode 100644 transport/disk.pyc create mode 100644 transport/mongo.py create mode 100644 transport/mongo.pyc create mode 100644 transport/queue.py create mode 100644 transport/queue.pyc create mode 100644 transport/s3.py create mode 100644 transport/s3.pyc create mode 100644 transport/session.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..acd5d78 --- /dev/null +++ b/setup.py @@ -0,0 +1,19 @@ +""" +This is a build file for the +""" +from setuptools import setup, find_packages + +setup( + name = "data-transport", + version = "1.0", + author = "The Phi Technology LLC", + author_email = "steve@the-phi.com", + license = "MIT", + packages=['transport'], + install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'], + + use_2to3=True, + convert_2to3_doctests=['src/your/module/README.txt'], + use_2to3_fixers=['your.fixers'], + use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], + ) diff --git a/transport/__init__.py b/transport/__init__.py new file mode 100644 index 0000000..9b4f540 --- /dev/null +++ b/transport/__init__.py @@ -0,0 +1,209 @@ +""" +Data Transport - 1.0 +Steve L. 
Nyemba, The Phi Technology LLC + +This module is designed to serve as a wrapper to a set of supported data stores : + - couchdb + - mongodb + - Files (character delimited) + - Queues (RabbmitMq) + - Session (Flask) + - s3 +The supported operations are read/write and providing meta data to the calling code +Requirements : + pymongo + boto + couldant +The configuration for the data-store is as follows : + couchdb: + { + args:{ + url:, + username:, + password:, + dbname:, + uid: + } + } + RabbitMQ: + { + + } + Mongodb: + { + args:{ + host:, #localhost:27017 + username:, + password:, + dbname:, + uid:s + + } + } +""" +__author__ = 'The Phi Technology' +import numpy as np +import json +import importlib +from common import Reader, Writer #, factory +# import disk +# import queue +# import couch +# import mongo +# import s3 +class factory : + @staticmethod + def instance(**args): + """ + This class will create an instance of a transport when providing + :type name of the type we are trying to create + :args The arguments needed to create the instance + """ + source = args['type'] + params = args['args'] + anObject = None + + if source in ['HttpRequestReader','HttpSessionWriter']: + # + # @TODO: Make sure objects are serializable, be smart about them !! + # + aClassName = ''.join([source,'(**params)']) + + + else: + + stream = json.dumps(params) + aClassName = ''.join([source,'(**',stream,')']) + try: + anObject = eval( aClassName) + #setattr(anObject,'name',source) + except Exception,e: + print ['Error ',e] + return anObject + +# class Reader: +# def __init__(self): +# self.nrows = 0 +# self.xchar = None + +# def row_count(self): +# content = self.read() +# return np.sum([1 for row in content]) +# def delimiter(self,sample): +# """ +# This function determines the most common delimiter from a subset of possible delimiters. 
+# It uses a statistical approach (distribution) to guage the distribution of columns for a given delimiter + +# :sample sample string/content expecting matrix i.e list of rows +# """ + +# m = {',':[],'\t':[],'|':[],'\x3A':[]} +# delim = m.keys() +# for row in sample: +# for xchar in delim: +# if row.split(xchar) > 1: +# m[xchar].append(len(row.split(xchar))) +# else: +# m[xchar].append(0) + + + +# # +# # The delimiter with the smallest variance, provided the mean is greater than 1 +# # This would be troublesome if there many broken records sampled +# # +# m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} +# index = m.values().index( min(m.values())) +# xchar = m.keys()[index] + +# return xchar +# def col_count(self,sample): +# """ +# This function retirms the number of columns of a given sample +# @pre self.xchar is not None +# """ + +# m = {} +# i = 0 + +# for row in sample: +# row = self.format(row) +# id = str(len(row)) +# #id = str(len(row.split(self.xchar))) + +# if id not in m: +# m[id] = 0 +# m[id] = m[id] + 1 + +# index = m.values().index( max(m.values()) ) +# ncols = int(m.keys()[index]) + + +# return ncols; +# def format (self,row): +# """ +# This function will clean records of a given row by removing non-ascii characters +# @pre self.xchar is not None +# """ + +# if isinstance(row,list) == False: +# # +# # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) +# cols = self.split(row) +# #cols = row.split(self.xchar) +# else: +# cols = row ; +# return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] + +# def split (self,row): +# """ +# This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. +# @pre : self.xchar is not None +# """ + +# pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) +# return re.findall(pattern,row.replace('\n','')) + + +# class Writer: + +# def format(self,row,xchar): +# if xchar is not None and isinstance(row,list): +# return xchar.join(row)+'\n' +# elif xchar is None and isinstance(row,dict): +# row = json.dumps(row) +# return row +# """ +# It is important to be able to archive data so as to insure that growth is controlled +# Nothing in nature grows indefinitely neither should data being handled. +# """ +# def archive(self): +# pass +# def flush(self): +# pass + +# class factory : +# @staticmethod +# def instance(**args): + +# source = args['type'] +# params = args['args'] +# anObject = None + +# if source in ['HttpRequestReader','HttpSessionWriter']: +# # +# # @TODO: Make sure objects are serializable, be smart about them !! 
+# # +# aClassName = ''.join([source,'(**params)']) + + +# else: + +# stream = json.dumps(params) +# aClassName = ''.join([source,'(**',stream,')']) +# try: +# anObject = eval( aClassName) +# #setattr(anObject,'name',source) +# except Exception,e: +# print ['Error ',e] +# return anObject \ No newline at end of file diff --git a/transport/__init__.pyc b/transport/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0311a202464c6bf90217bb05bf7320139c170dbd GIT binary patch literal 2005 zcma)7QEwYX5T3IgJH7^`AOWqA&`P{Gp-Eeih!jN;qO?>Z4b+L0VyQZ6=&cemHw zy%-ByLMlIk7ybeNju+nGo4s>d1thp=H?y;|GvCh4w{HLOPFSA({4%EIYvccy81^Nm zglL30piw|ci;^~FEgH2~a)(BpmApoyH7|E4>r&pMQIGO<8m&{1V53J#mwq97P4vU4 z&z?>O>m26K7E$=MbW)reS=dq=Cw9cc-3Q@`Q&&nH?TX{M%E$7aIGrl-{WKM)DxMZP z)04S4I(ig_r_(!GqgXJKeN+V+jw~5NqJcX;R#{PMwDmEF)@faJ= zUadAxl(nfY#NbIL?d3{qKgT`PTZlBkN(V!LH%b^KllwD+2tpQ#D2={KleCzKTmiX< zkDEETh-H@XMywMR4%MqFH7ZwyKCYN0vT#0jtc&w>Qq@t#xkj$w zj$QBg%y4p%%5$A%dgi?~8G_&bPZ)h^CU)OT{mNwf2bd1;@kYNwB?_4<=o+cnD5bSC zZIbnx3mZ5zQ4ESePr+?u)9)K9+x|oJ2 zy)s@r;|4IU2kklfPMdiQ(SL0I6^8u~lLB!AQlKn()`7uo8lDUS?D-SyVGZM34Er0V zh3EO*W#FY2-Cg2vNXs@Y!2Mfv*(GyGpz;MQpy|@2MXv)|cBt4U*P;tp)*8_L+N?(l zEZ1q$X^iK$$#rN^FWYohd`um;MwcO(Uny92@m%z2;k|$Jvc}s4CDgWoS8F$ zz^B?bAKL;$3MoZU{_XGCRV-$YbcH%Z&`ddzEH7qj8Z=0k{lzY z-po3bCHFwp0Ndmz`#z-W!rYIdv`AeP?Ur-zaFx70SlK&#suu=f&~0sW?yPMF9|i5; zc55@(3)wkgw^v z7e#o1u4Bpy0SuO$)B;JJgAS0&;VnMEc};WvoypR%&rPgzyuw51X~>oI(f&6!pQ(fT UqKBX2xaGTcgKn_l|32vc4Nc 1: + m[xchar].append(len(row.split(xchar))) + else: + m[xchar].append(0) + + + + # + # The delimiter with the smallest variance, provided the mean is greater than 1 + # This would be troublesome if there many broken records sampled + # + m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} + index = m.values().index( min(m.values())) + xchar = m.keys()[index] + + return xchar + def col_count(self,sample): + """ + This function retirms the number of columns of a given sample + @pre self.xchar is not None + """ + + m = {} + i = 0 + + for row in sample: + row = self.format(row) + id = str(len(row)) + #id = str(len(row.split(self.xchar))) + + if id not in m: + m[id] = 0 + m[id] = m[id] + 1 + + index = m.values().index( max(m.values()) ) + ncols = int(m.keys()[index]) + + + return ncols; + def format (self,row): + """ + This function will clean records of a given row by removing non-ascii characters + @pre self.xchar is not None + """ + + if isinstance(row,list) == False: + # + # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) + cols = self.split(row) + #cols = row.split(self.xchar) + else: + cols = row ; + return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] + + def split (self,row): + """ + This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. 
+ @pre : self.xchar is not None + """ + + pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) + return re.findall(pattern,row.replace('\n','')) + + +class Writer: + + def format(self,row,xchar): + if xchar is not None and isinstance(row,list): + return xchar.join(row)+'\n' + elif xchar is None and isinstance(row,dict): + row = json.dumps(row) + return row + """ + It is important to be able to archive data so as to insure that growth is controlled + Nothing in nature grows indefinitely neither should data being handled. + """ + def archive(self): + pass + def flush(self): + pass + +# class factory : +# @staticmethod +# def instance(**args): +# """ +# This class will create an instance of a transport when providing +# :type name of the type we are trying to create +# :args The arguments needed to create the instance +# """ +# source = args['type'] +# params = args['args'] +# anObject = None + +# if source in ['HttpRequestReader','HttpSessionWriter']: +# # +# # @TODO: Make sure objects are serializable, be smart about them !! +# # +# aClassName = ''.join([source,'(**params)']) + + +# else: + +# stream = json.dumps(params) +# aClassName = ''.join([source,'(**',stream,')']) +# try: +# anObject = eval( aClassName) +# #setattr(anObject,'name',source) +# except Exception,e: +# print ['Error ',e] +# return anObject \ No newline at end of file diff --git a/transport/common.pyc b/transport/common.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91a168c9e500c597f44fd70800f6455024b930e2 GIT binary patch literal 4344 zcmcInYi}G$6|J79XEq^_Y!b{asN+TKIN08x>@Hx15Ury~h+v>?u-1`1^mLa!ZTHJv zHGYV(SSc$WpZN!fU%^k}D?b3vxivE$cCoTw7`yM(<95}pI``arr~a>--M{_zv%N@7 zK6QLQ!((pw)&IfTqwbc~r`mtH=5Jo;Y!I&a+e7XD zYMA)jIvVChS{z>ZFJEqSyW7LW_*oH;QthJ^Ym*%2I`(bh8(m?I(Dq9tNiQq%!(!HZk)+VTPBa3BYohDg= zrIsi9zaHrkRf2FZz{D?3rW0G*n54-4-itIe$J0^s80XM)78Niyv`~ccmG&zg#vhzj z*uf9;*e|Q%G>Mb^(9bk17xQs4izT8kO*s)2vF--?WRz4o)4AorI^~6&quUu2w&-?n z#54|b+wGd$K;z3doctS~P20j`01I>#p(P*Srar-AoTWA}usl1LS zCUQRxh>0s`z^o(j^-E7#PaW0NAqGA5eD8+3l)Koksgp(Z+M5|pi=5$HiO^b}C@(En zH=|4@15VmDL{#J!2+Ax@`4!I4!CTPm1K`|#hl{#H-0&jq#0%#m+x-)bq(Q53+aL3* z=DzCgtD!#ri&B4{9kZs+hwgyCQHtf%)gR|fcn*z;N!SU7H5FWx7E0GGd)d6Q zdE8Wsw5|aN7I0kiHX!JBIvt4}2ctY9(?DWSIZJZzmQ2bDV?iVO#f;yTe^3>fAA+fa zNfapy5PE>%GmQb2S@XI0&#XT(SPRM)R5msNH+_VmstBW@--|I+C4&+7US?oA9EFEk zww|A2q>74klpz!x6cv^`OiuNjN!*Lix(Txqc1*soYlT1n&ld0T^?9jrPR5K7Gv|J? 
zru`H)a08MDYruh(S}~NaJ(l)*!Vw`{#Mab@NFnnZHw7N-{ak?rgxgb#aHQk9s#?mz zi-i}H2?tM%X?O>wF;}0yggLzXR}iKJsAN-$o=~CAjdB~b@hybTG-4M=PeF6qBs5xk zBwtDFn9ujK9)!A%@rIe`Nt{F$V3$vSL{+ZVQN7t#$NIuZOfhAeSP6*CCwVOUr(h@f zbQ-1z`l6TQu|AhcIFv*<$7iZJfJ9AvCZGWU4-zawh_PaPAQyaHpd!gouYM0(5wj8`S+f1Q2y0iag$Rqz;6~!gj}F5C<6HPF)ue zfm&b`SbyAB)s_Tl%r@1N9MDT^5i#VxYd$%-qjt~)$ghQ=IQOjUeBlya(2~J6b;d2V z$X(iM+=dtMUQ7r9|NL*hR@x?2=E$0lvH|k&99PhsQpH_3-hWmimog8mNp4`hkQdfx z#=xvNxVXAR34s)h16>42@F}BX2{ zO)<}X2*7wUa|(eg{Q_cVNt*gm3f`ikiHa&VL~IHZf{;JBz)Xhf25~Ac^3~8pN#f56 zEe5GKplEJk>sR|~^$#oEm76PXtt_nU;owU7BA0L#6Pzr0mzX3+e9j{+F@*t9nE4`1 zjSlWYL&6Dst&TRrzRWVVmXe$+U8Z5AB}6&kxvH#1bmc8H-i&g+?AdfehK!GC62IojZD5C@u5CIeZUCCwsft9|D5eiQ_ zz2q$hJBN^iONvh%of#5^w6!`bEt}w#E4G<95v27YYSjzA$^ofQ&k{RKa>+VjB!vkv zW=0d0V#FMJGAgV#YcrJBuV-uUJ(zFkY`veZ?~e`Xx7=Go=hgnw-uK5#OUnGuPkBzTMB+Z69n&?UqNueOxdb*U_XzG7rDHy~PGr_rf* z-tFA)+>&6n!ygd;qlV{R`b{-F3UyvdL@|9f&^CJzaI+X0y`!oXMj6H$@icT@uy;`g zetUmwqPw7~Pp7&PKiF+Dz&Kr$63i#FvU!3ZaR`{6gZC+T9l;|A!IVkdfoo9p@J8cN zR+=whDX3rF=(U&0+b{4)fuOv1yl>#S?S2O^pzx6!V7SHcUB$BLR_3I*&zmIOnAUzsAuji0w$>r8jF;B6SM?EeGdzZ2s#QZh zTAO=y0fTbLem^dvnd^rmJ1i=6X%`ZBc_Fo3@I$II=Or0`pQp({{5s$!bE~3veahsP VPaL9blh|_Mrf%!nfny5|g1AD$6>3yKx}eCBmaAQI z&6(9lA-~jlOt5hC{*I=-5!?9oRvG7JQk1sc zv-&`XPuIiEV?Ex9Z-rX}9ezH@!Y!Q)@?uo%9k<(CgUp1x*+_@D$KhcaPbRtyzeAk zz!B;k*jitLCl9$Dglwc|fCK3S^_x>n=exo3@4)R&n<%6(Beb+ZKml2kKWICb2Px zMVZRvewKDGVMatV%WyJ7B2F5dU;FrB5Kt#Yab zfhkQZV$&j-j&*M3VWWB=cJ)zatg+lhQtH_1{(LTFnfO49(9I&(U7|srj2`Vu2Y@jV z&mYMcdsyGaO-W;}qGv1by(6x-J~@_s^!r(!*?yl>%o>+?ZEpqFP4A}vj<@39@K)=q z-mR-^Y zIGDH`PY7ple!DoR^QH_giNBhUOf4BW4i7SY$f5h7tyG(A zFApmehJE+q&JliLcTyGyS*pX`>1gCg>oBu}YEn1>7iLEwFvz0NzR;YITd@mf*Yc?_ zTi2n?+1JMbF0%WMMnKRhyNDbhahVQZv+x^C27X|S6chpd5)FHZCX@aM;b=fK0x#t8Fq6R0$s8^?k~hP$3YD zZeq1SQJ6!bhB+T>L=Xl23U@nzM0XCh7lMxb&6#d--{xG1dZUKyzhZ!PrQA`Yh z{AlZb_;CW_- z_sUO*R~inYpiw-~gaT)P`eEK=4&a`9vgJ+LE{<@}C83;YMHi@fBw3JViCr97nDpG^ zM&zJB;hF_$hUYdg+V-2CBYQehbeqjO8{tRlI{F10E?;~^dS`jU9ZY?S%aDDWb&%vw zg7$J_L6fcsNiZruD*G`mapiMp(p4XzNnuCkbOzdIs3eN0(JGOzBe(0=n8VF?$jzDF z{L^i_;w^CI*;Uhg(re2mBonjJtD?1_>by#R)x`7atH4!{67K{Fr{Kouy+FcQARv7H zhRe{pswG}UbJ6RB`u~B>nO^1cUGXZg%zjIiPlYH@M=e!V5TM|>DhQ!Lq6eY_t33D} zlEw~;C_Et3M!k2e_9O+wn^`i5UI!U9>MS^^$Luf=$3+`H`R zvq#zI8d~-u8u}Ne3mQ~w)w}Aw4SFHLtROp+K)^V~jX&YMZ_s=n>w9F(nmW4cg5dkJ zk{4+^ttxL}LDm4cJ%AN)d-(jW!rwC>2WY_%Sr~;k>TM~uU!umw8>uN!V`CA3)H#vF zYtPL_p#VTE&oFl)|0bfKx4qtA%0*>GiDbYvhbs76#8&jA8 zL^Q306ABPBrql7nmFmWpk|zPw+GZ#z`FUHUu`3*g1^)3w>67VqG7G_zCiM{w4pazgFvbn`ehI+Q=mk zpQ5+;tn|)evHO_%XIw7dXDDk2*WFriw{;4Y-XruRAxetzxX2?ss)z}i<50%Lh|<2_ bhei1hT;1{!6<`R9CQiGI;`-XhoyY$NSuH&Z literal 0 HcmV?d00001 diff --git a/transport/couchdb.pyc b/transport/couchdb.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2639714fe507bd151eb2901faf2dcaf12053df1d GIT binary patch literal 5447 zcmb7ITW=gm6|U~NczR;zo+X2`+7>IJ$!_96v;+|)kWDOBs|^<0V_{)w8MV91o*wsf zPr7P+LCy==*99baKuA3D19;|@f5Z=f?>jXYv$7L2a#dH?<#bh@?|kPR*Z#fIPX7Mr zt3*wIH9Vi8nAfOMrM6LfD)m+Fscke|+gIDZN^2^utGuR0b+uhrqlVgUC@*cO^pZ-O z>T9LmDD~^@rb?HTZYo_@yI9nk4=kxPkb#B_wC4jYey`VE<~II}s=XfD`2J=U7iL^m zw%xP(K!-2a!i{4++KKOln*$yGa*&0aIvEsYUhW;Y+na;Tgu7X;Lp4%1JUvcfyIuioNjj=f4PF)P$? 
zKlp8A9=6x@QQ0P{NXMzJq`6gPSY{jOeZgG%gr7iW)cL@dCDSslGJo zykPb;RMhL%xuihV4uWUnxQa)?q{@O|P#PP4KjgiHmEP5rE)soePL>A2#Ngmrijy3a z`GHQ`i}7@)h(|gIQrz%PY;?%iuxsOLPusigAmMg-2$nBU%r*3!2oO52eQEisbD~t= zb=OqqwcrAC_Lo%kr2-)4ZR@EaPTEu_KEUB|SxprmD%Vf3bqK?wp*qu9{&W`R4r^*y zC#bxWnwq{h_`cs%Rh{!V1Hv&{Q+7$ck!k&w$Pd)8Evs5m1D(ElT~{Y{HM~S2W6D>$gsbUeEmdRwK3o9?{vln7mA7#cG%UvXuj;-#`=Ter553~r~C=1;sD&)=R zd{-I(j)`~=k&Lm0^<6xaROTjHc4o>CrgpeCK9-&I`&p6Me*ZcqnLAwMwY?RT+um*e zLvO{u<*n9Ny=Cu?chwhc@&ha{P|R;oIlT=%swx9CYU(wV=_%{0eP5Z6WkO9YR}a|L z!={G~0ga%CtAQIJ1K9S}uqDXYc2ff=n)a~KZKICJPqJ%hG^PQ`9sfMejgGEyg3RJk zq8qGueG!izT}9R9uA|$i0Dh&$(RbO2@~(TL#xn!eq{knjX)be(cguUvt2b6_%U<1+ zi{aN;#AyG3hQ-mu{CGwfhO^?{~Ywk6A8oUkwSJ}KT8 zV}_|?(4HP_Vi3x}6~Z3zsU_8c9bwrw07Of5dffnfGXF#cHVU>hm|2JVNGB6ZN{+*W zOdoRSF(@q6Cfh5*84|<3>v(X4U)UX2pfV>f2lhMg;wV z^_|}b-=U|e==pthbb~y${?#P~E(C-9%6s*R+Dhs6oi~`9t=N|%dvO5#uBF-19v8O} z@sy`bQ>>Cgro(i?z`Lvcb#U5T^w3ee}Mx>KgQDEp-lZYJ$4&NZTn5n@&A1cMr*7duoA-LeMhue zS-$v;w9dDPA7I5x6hlI8*1?+J3);(#1#h}6CBdltc;U}c#J1nTn=U;8Zwg&9*fY5C zIb9|>;#PC0~r^dAzZe1W$mqwi&Wbe%_X8>@tdBiGNX1`grm`QHCH&ViRVW z(E?GQ@HIpwfksgtia%rUb`gkC;`Ac_fu^Bw)mR0*n*ItPcGVMD&Ug + @param row row to be written + """ + + label = params['label'] + row = params['row'] + xchar = None + if 'xchar' is not None: + xchar = params['xchar'] + path = ''.join([self.path,os.sep,label]) + if os.path.exists(path) == False: + os.mkdir(path) ; + path = ''.join([path,os.sep,self.name]) + f = open(path,'a') + row = self.format(row,xchar); + f.write(row) + f.close() + \ No newline at end of file diff --git a/transport/disk.pyc b/transport/disk.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c23d49b4548c10b6baf227265f73964cceca2ce GIT binary patch literal 2855 zcma)8ZEqVz5S~51BaaJNI@? zQ{+fUZ9nnz_#u4Y2f*{pUgAJYgXJCX?aj{4&OGysTYs+h{&@83lc6-97JhGIxSt?$ ze2t7nHZX}KZ^_umB$ACtCN0@$iD@+2veA~jEjE&ok-Q@>MP7*fxY3ckE4HPpONXmn znqBWNJIe~YWx8`{qJKP0b>BSGW3zrPXnR5vyxpl=>Y4gNaNew@eJo8yHs;fzo z!-1qfu7;T}sxrZMJFB5k7eCv}z4-RDw#hfJ!EMj`my??}lLz(8_TnM0M_)O<$8Z-Q zyvUA`IU;4m%;C9_b9frS`v}otCfs$1UL3=AR%erVn)&TG+jP}<=54~Kq#7lD+a{x8 zZ0Yi*Rj;wfx3QNhYicURlX!-FFeu8x4+beu2tlW+v*VG5T#s^nEQt#DE@r;YN;j=) z{{^FPZTdXro)Z|36X~oPz2(78@A1uh5aE_P5ig4C;;!p`Dh01RBT_kaMGv=oh4W4| zQs(G5H~bQmnvnB2M}uM%RzuEdgcluRxJM9hD7b=nzldaC!@b{>rO#P_q6Gn)u?Ay zOxwh!&&qB9SY{JD7-&@w29qkEjrqt_4hDHO1U6|djVXw}#p!QD&u~fC8E9A)`7RqftFy^&UewAe3aXS@U!?CXT*5pA z-~ceEqPUSC2VNlzncsWVx(N|(h4}2oQhh1_Ja3D7J-F?@ax3m1bWrDrS#^U&zOMec z-(cqTYRxSg&ZH=j%OY^=VQ)}kPexUp*gZR(g)UGhW#wt6xMq{WqpttGAAMn1m3cvZ z0A!R*tWTzOanueBtc6nKP16r44nwRF4=m1LYh_MVK0P+Htf8+l91|hkOXg$MC$ph)SJQdN*`^)GH>UWPVDtHAUG3U(J}UO`;C$xY zts}ai%NV0+$2`c}67RJ>)KE%-)W^Lc{zIT8HCfHp`m300{*{k+sv6=Jl583f_c5CVaFc)AO9sjr0==Mcq4?9Sf+>pmYnveFPy( zJ<|itub5SH!JIXxAWxdpcv8AgPeUbTUryhrAm9S4)5_u~o&5(lu%TSS2&~=$tFOdy sxQ!Nw+QfTFN$DYIt}0WqMas?p>lkfJH$p4AMMHYriaBFeqB9r&1` 0 : + collection.delete({_id:self.uid}) + + collecton.update_one({"_id":self.uid},document,True) + diff --git a/transport/mongo.pyc b/transport/mongo.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f52768910b05d7b3b6e683ec556283da6b90e73 GIT binary patch literal 3162 zcmc&$U2oh(6dl{Un-7wcM|xl$8P9eT>+*jKr)CV@H()TE~ld(GQG<$ZP7S7}eBn=0+AbEVFddVDxg=|CC$ z2WsrAbgfmasdQZwJyC45iglF+YBYQc=VyQ88G7hB4j+}p({eA*OyPz;Uk)Hb6Q`z< z?)O#ZOttU&^SP~8+~Ogc)euI}8TfJtgFK!KU*H!zJr$0I#K3+5v8nMqjBS?a3C~WC z^tdh(mz9OpabZAK%awW16A$Q&$jO{ z3EG}a?Wvtc2e|_<0pXb!yih9gRI#Nn7uYsK?!2aUIFOwIstmBTQYW5BY(hOkvjJc? 
zY1AJOVlO@9^L2Vu#1kX^Gnod+gS>{vvFps#D#b}{mur%`s<1|LqjrVs!7Umj9jY9| zFkdKx`N?h@8lL>fB*$@)*@@+w(O&rA(XG+94{m*a=dNQQ>nxS*Y5>|FGP^>wCjAmg z$8du$iL7H{rbm)PVudyNSUPd_)P^*0h}H;u89i6Ug(Y~mCC5IUJ{LQqC@V4-McmoG z%_VD_{&jD|+r-~Rf6L$Yw!JpnXy_hiK7e548|kSNpSgm3dpXaAE-#eYIqA*Sl0D^p zX5mOEH}jNEg=C+*0+DCdMf2!J45k&|-PO4=Df8S2MH;mr5fWNQYD`7`@Oli^Fgddo z*!}zlx|Up5-gWOoPa-hGh!#YVqmfZGDbqUVkZYqTEt4n;FQOk3zwSust-wAck_@pN zI0y#b#?`L)G{q)?S{}w)%E~9`tST$u@HxsS*w>R1(`Sj16Q)=txqEP&S)Jrab$y!U zxu(~8T$K|Yw}pqGYwJo>9kQ)5C_IIbEh!5}Du8>jhST+gqhV1xjJt5`%2ARrv^zkP z`b2C{QJPwTa`Jmq=;ukmxdMtHDRoz(8D4;*!*ms#g#TkS%YBq56+M}s&)+>NKY&=w zT2@LVB9b>lt@GN9q|xCP1!*C)8fQhikYgm6&3-nL5}os}HB|N@?z`N8yD0Bc!!KIN z)D^U-K`9>z5cUA0w)#jE=l6p~L{nq4rv+{YS8DT;(hiAe20znRz>nUrrt!e>kvW= ze=F#y(OkX>bn+0ehiH~~uA;`d+_)qNI>HOWYex{T^5PXZPkGf7*Nz}vut6esoE6Z` zE7kv&`giCL!>E>PwdB%Qc>V8Le}?%>niKg{;5P)=!0)go0r%yGs>;(Lfm>h(3t~Nm z`z9)jh74K3`;fzqE<)n)JxuPBU`71pt4d)164RS<>|erN)Uuf->{0gwV#)fJ&F~98 zJnO0Ip71+RS#vFUB=cSRR)E)ibu#EQ_++|^x`e8J7R-DPaKb)H1Lm?DW$BPvxc~sG zQ&QFiZmU^EY%Xz=8)w3sT*(qLOPaC>HDA$%gQ{LYNbZ^W7H^0YO|r(_;}6YZCAsXq z>#yeGN0=0nDP0((w;~oHZ^X~g7Pz?4Z`RT)xd5ln<$Ge`Q>C2XO-N>jOiA(f#NrhV h4#{uX_|0q(VD2?n?l;`iGUqP&*Ssz7%8i{1{{RWISJwal literal 0 HcmV?d00001 diff --git a/transport/queue.py b/transport/queue.py new file mode 100644 index 0000000..846f591 --- /dev/null +++ b/transport/queue.py @@ -0,0 +1,200 @@ +import pika +from datetime import datetime +import re +import json +import os +from common import Reader, Writer +import json + +class MessageQueue: + """ + This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) + :host + :uid identifier of the exchange + :qid identifier of the queue + """ + def __init__(self,**params): + self.host= params['host'] + self.uid = params['uid'] + self.qid = params['qid'] + + def isready(self): + #self.init() + resp = self.connection is not None and self.connection.is_open + self.close() + return resp + def close(self): + if self.connection.is_closed == False : + self.channel.close() + self.connection.close() + +class QueueWriter(MessageQueue,Writer): + """ + This class is designed to publish content to an AMQP (Rabbitmq) + The class will rely on pika to implement this functionality + + We will publish information to a given queue for a given exchange + """ + def __init__(self,**params): + #self.host= params['host'] + #self.uid = params['uid'] + #self.qid = params['queue'] + MessageQueue.__init__(self,**params); + + + def init(self,label=None): + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.info = self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) + if label is None: + self.qhandler = self.channel.queue_declare(queue=self.qid,durable=True) + else: + self.qhandler = self.channel.queue_declare(queue=label,durable=True) + + self.channel.queue_bind(exchange=self.uid,queue=self.qhandler.method.queue) + + + + """ + This function writes a stream of data to the a given queue + @param object object to be written (will be converted to JSON) + @TODO: make this less chatty + """ + def write(self,**params): + xchar = None + if 'xchar' in params: + xchar = params['xchar'] + object = self.format(params['row'],xchar) + + label = params['label'] + self.init(label) + _mode = 2 + if isinstance(object,str): + stream = object + _type = 'text/plain' + else: + stream = json.dumps(object) + if 'type' in params : + _type = params['type'] 
+ else: + _type = 'application/json' + + self.channel.basic_publish( + exchange=self.uid, + routing_key=label, + body=stream, + properties=pika.BasicProperties(content_type=_type,delivery_mode=_mode) + ); + self.close() + + def flush(self,label): + self.init(label) + _mode = 1 #-- Non persistent + self.channel.queue_delete( queue=label); + self.close() + +class QueueReader(MessageQueue,Reader): + """ + This class will read from a queue provided an exchange, queue and host + @TODO: Account for security and virtualhosts + """ + + def __init__(self,**params): + """ + @param host host + @param uid exchange identifier + @param qid queue identifier + """ + + #self.host= params['host'] + #self.uid = params['uid'] + #self.qid = params['qid'] + MessageQueue.__init__(self,**params); + if 'durable' in params : + self.durable = True + else: + self.durable = False + self.size = -1 + self.data = {} + def init(self,qid): + + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) + + self.info = self.channel.queue_declare(queue=qid,durable=True) + + + def callback(self,channel,method,header,stream): + """ + This is the callback function designed to process the data stream from the queue + + """ + + r = [] + if re.match("^\{|\[",stream) is not None: + r = json.loads(stream) + else: + + r = stream + + qid = self.info.method.queue + if qid not in self.data : + self.data[qid] = [] + + self.data[qid].append(r) + # + # We stop reading when the all the messages of the queue are staked + # + if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: + self.close() + + def read(self,size=-1): + """ + This function will read, the first message from a queue + @TODO: + Implement channel.basic_get in order to retrieve a single message at a time + Have the number of messages retrieved be specified by size (parameter) + """ + r = {} + self.size = size + # + # We enabled the reader to be able to read from several queues (sequentially for now) + # The qid parameter will be an array of queues the reader will be reading from + # + if isinstance(self.qid,basestring) : + self.qid = [self.qid] + for qid in self.qid: + self.init(qid) + # r[qid] = [] + + if self.info.method.message_count > 0: + + self.channel.basic_consume(self.callback,queue=qid,no_ack=False); + self.channel.start_consuming() + else: + + pass + #self.close() + # r[qid].append( self.data) + + return self.data +class QueueListener(QueueReader): + def init(self,qid): + properties = pika.ConnectionParameters(host=self.host) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True ) + + self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) + + self.channel.queue_bind(exchange=self.uid,queue=self.info.method.queue,routing_key=qid) + #self.callback = callback + def read(self): + + self.init(self.qid) + self.channel.basic_consume(self.callback,queue=self.qid,no_ack=True); + self.channel.start_consuming() + \ No newline at end of file diff --git a/transport/queue.pyc b/transport/queue.pyc new file mode 100644 index 0000000000000000000000000000000000000000..281d8a0d5add3ab3ec7945040a42288132433942 GIT binary patch literal 6988 zcmcIo&vP6{74Df`?P}Ljtk|)!u(1c1A?r$F1p)=-vW;U0k`%FGk6mR@5~gNnTH0C7 
z&a9_rZ7WDs6!w*ZBgKXPAV&(0R1P_F;>wACfIk4f@Ab^Ah0OuWmNcz5-97!|_4mH_ zUbp_W-2d&3Uwsy->ZgVOSMiv?qKNV5s8p#9E4wOn)WlI6j&iCtS8cd;zn0o))pA>H zv_)>IbU{rzYNMkjUA56|zO=p3X-UNk>akLfl=}I`qKZ38FDl(tn;5v;H1t%ATKsjS zq1QC@Roqv)uQnYOA8Kk3apkuM%h*NoZ~Uwe9Ejy2cNLbpOeT8ZvZjN=*I}#+5$_gB z32_OvH?%R~mcBL9Gu`Niqg=yd&ZE%K3}!`%r8=CO3UEch##$mQDF61L%}@S@qTlP? z870Px($E-hl;|QXqS20rs#u$3E7P%8=H4jG;#7M{hQ)+YndF)A9wg<+3%z?>k7smo zUl-oYB-xfXO}0aCvj`{pL0)WogM3zar8cGU!a{q)&|uI!^NMgdOv=f<^H{{iQEti} zg;^5!l2~VDvWeAr`KDKnw5K0NSi_cPk9*JTA?xV(B5H)1Z8hca;k4 zi)&vTX=&LP0glzpdPxm&O(hYCHOYU>Jm9;}R)2v7&&!^i=v1eha&FTQ+cyI=Dk;o& zAgzt^iC#0MzOUD|l5!0vAHG!XzQLfD;*~^9Lj^HB*k}N5MUPN$q@{O;T6 zb+!=#*x^`=W4ovF6V>b3ReRVenp#A)18v+E{H+TR%051f@+{MGvat#Q~>rn$1t$nrF+up+g;X2D!Y6K9qca0jw)TX?W$s} z+T6Hp_a{oIUNTWfyU1 zFip(J11?Lzk0cak-n%z$-Sh_j6U1=`%jXFk7-T%lZ{snXMztgUv+uHQP|ai5@|^;jK zrLv`V@hbH}TUy&{yg;i$&tO4n7StFg>WbXu8RK-}GJ)bps>NzI)rL)V<%PKp)zYn;&8m;QUMPkxdo|x1QwXDX6)p!N-x(onr54g#7VP~KZQ4*eQ;n^;()o+i-gG zUOwr?XIn|iuGbL68aOSqxDA=%PBGIW5Oh8*uB2JZI-1@MKisoExDbA@eTGRE%Z{+9 zQ6Bpz@wN3&)Prm)4Tn0FPo@QUyeJcWaIZx5&?mix|Au>1&N1f*eqVQ%oOAAJS4@Iv z=6=K^{s{pm1e6hUj@s_1;!N$9Q45ylDN7rmwMrDiYXOcx1N`%z1tUNQil;fX)gC~C z_7;x|eTymOJs@*I?R8alfeHXnQHc*&57|!qdl|KZQPB?#U4FpnyK0PpZ&73fc)&4` z-B-Klb4WppV}N)`?E>`6YL`~wK2mLmUoPQALdn~MSFqJ}Q9fgqLIQ^(f8f(76I!DY z|Dzw4YtuALGXDhYsK#(QO_NB-XKifq%>NoHg^G)ORsv+fw%!rM4D)!$f1Rz23@eTd z1mRS9tW(fdu|qGSCFr#9BA6_{`Q#bFMS_sclwlTWS*a-tnSt}mCgRy-YAl>NG@`d8 z3M#s^AX2C^zhzsR^5g@$)LiX--EydN9kB30$i30 z9a1TI5e>=_SO9x9EI7h?!cfNADUC=@!RnUhSX({_Nn7RdBda{3$NvF}7ov^LbY@1M zU^Mai3@AI!QD=Q1hjo-wk@aj@JK5DMT@uTtdWvg|BY^NypSOP%#7|jbiZ)ZaJH;j zRNGxu#2nMUqW)49CBQ+$c|+Iq!3YM}p-)HI^jt+X5<~N+r)t^7&bB0}(%Bk|Y%cTp?eE6u~$$SM2ZLY;UJ|3_@4D@EL(v)I4Fcdk7} zzC&L!(octf6vZRkH?8OEo%k$3%9|^*1rTrRX8M+l@ zna&C4NUH;fd)QfVUu$1+gige0i72j8Q1wR$U^hBg;0X*U%i|d$U4e5&*!PYcpsTiQ z2MHw~4y(%dKD#SHBt8K2BoeGGRqzIsZnsr&S-LOMd^tAk6%!(_L2P*l2y^8dG}|ce z-erZqyX?7K^X)&d&^n*3lT4dg@>b(b!Ng`#n9{2bWuKS9mz4)0`+nHutkpfl1sim2 z=@K_^US8mGkjW|VQY5%b@_;l1hq#}s#|}%>@lKJ$eGsAx;g`)OLwloE^)t;7u{XpW z-c(1tbi>OIM*LiR1Mwcrcz8#LgQ%b`@V1mT^d%m9Tk5VjXB}(%bJLUcyf|B1$83M*jPyY;{k}C}#+JT-1A_9evJlTq z3qnLsg+%#ef`pw%;j`Mt#3T{1V4p+V(I6byrCzeP!&f++xYQ*u6hdG89!CIt^!Vzv H)w}-zcc+mT literal 0 HcmV?d00001 diff --git a/transport/s3.py b/transport/s3.py new file mode 100644 index 0000000..9b117db --- /dev/null +++ b/transport/s3.py @@ -0,0 +1,83 @@ +from datetime import datetime +import boto +import botocore +from smart_open import smart_open +from common import Reader, Writer +import json +from common import Reader, Writer + +class s3 : + """ + @TODO: Implement a search function for a file given a bucket?? 
+ """ + def __init__(self,args) : + """ + This function will extract a file or set of files from s3 bucket provided + @param access_key + @param secret_key + @param path location of the file + @param filter filename or filtering elements + """ + try: + self.s3 = boto.connect_s3(args['access_key'],args['secret_key']) + self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None + # self.path = args['path'] + self.filter = args['filter'] if 'filter' in args else None + self.filename = args['file'] if 'file' in args else None + + except Exception as e : + self.s3 = None + self.bucket = None + print (e) + + def buckets(self): + # def buckets(self): + pass + # """ + # This function is a wrapper around the bucket list of buckets for s3 + # """ + # return self.s3.get_all_buckets() + + +class s3Reader(s3,Reader) : + """ + Because s3 contains buckets and files, reading becomes a tricky proposition : + - list files if file is None + - stream content if file is Not None + @TODO: support read from all buckets, think about it + """ + def __init__(self,args) : + s3.__init__(self,args) + def files(self): + r = [] + try: + return [item.name for item in self.bucket if item.size > 0] + except Exception as e: + pass + return r + def stream(self,limit=-1): + """ + At this point we should stream a file from a given bucket + """ + key = self.bucket.get_key(self.filename.strip()) + if key is None : + yield None + else: + count = 0 + with smart_open(key) as remote_file: + for line in remote_file: + if count == limit and limit > 0 : + break + yield line + count += 1 + def read(self,limit=-1) : + if self.filename is None : + # + # returning the list of files because no one file was specified. + return self.files() + else: + return self.stream(10) + +class s3Writer(s3,Writer) : + def __init__(self,args) : + s3.__init__(self,args) diff --git a/transport/s3.pyc b/transport/s3.pyc new file mode 100644 index 0000000000000000000000000000000000000000..baa8a005edfa678a42f55262243510209f56b0a8 GIT binary patch literal 3141 zcmcImOK%)S5U!rx9q()$9B>FR5ikjdEKwjqJ_JRGm?XRu1Uk|!8oFUyM-Jx|{)7=*YGz>)Gi=ekJ=&^+>{aIL_{e@3o zfwNt9Cb8OPgu&$TnoCGu>9m$sJla5F;=CRQ!X8{lnsb zR?7?2wmlm4sQ#7Iu7~uylVQ;GiXD$W`@B?iA1f<;&XM^Y1d$5vZD!^mDvz@~5A_pU zCn*O_!h=8=ZNq9^UZGr9MQ9eAaE0T#dXx?IkPllLCv{SUNt$Y7;*s7yv}kl%Yx}Ro zabh=vAg|Iyl7c(zrj~mS?SvITM-Z?zWm4!67P+;o+z7QxvB3s}f#~3Jc|EvRkmhnM zFnyHdnLw4n=+3ik#{#;?z{%@cWh*%;t;$lTHa3e9tI661JZg?)s2E#kV`P}<(%t`UhOt}*x(`A^%p5o*s5IlMwwKJ} zd*f|6HIB0~vvGVJa`Ou7cU$TWb;Wy2z3t7Z0Dmc;HGKrvY~hdq3SM1yw=f&cgFNR_ zcFH+#&Vn^uH01T!{l*~9Rvb&D;<%`Wlbm09UmOpsG>)TF;G@?-4(|F-FD2|8GQ%13 zTfPcT%0)Oqi5IR~r|@8ip_`*K}GH8ln)8NJraTCgaLv0`sdl@7*xZjCI?BAalhP zq81r1A_tf>iivMS3;U;vovJmR&P>LP2D#o9Xo5m+l5-J7m6f9~S*s>C%xu%&raew@ za1w4d&5gLtpsDc?n&N_WCH#^w>EVKqi1!_&?kH@^j`yMF@Zm7Xuw$Ia-FLuj5bY}3 zzRV2wPNJI+-XwH0v@Ud9wA)de>Unwf=o^n7&eO_jIgj4glHPXb8l@KS5Fs9_)#(W@-QB$A!X(LkSXcq!Eemtsg$)Io{yG3^%-Xi?3?sRYxlrXcd;HXuceD^-ciuiAqU=wf` zd~6vmCLC882IpfSezTh7!?1y4GZbV9B+ck_IKyU%cjj6NJM5y)ZgZo=JcwTA)t8u@ zV8W_WXM!dPW}X$9MN)~NFDq#^DXry(TUus>bX=WQUDZ*NBStwtg31eET#ir&oQs7fQ%KVa3lK@v39u%0!uysp9zYIG;;8r- zVl6t)0;wX#S#*wH@jL@qj9Ts{U*KCB9;vxj&+BtEqti?ln7qzJx@gE8-xKl42d4{n z6n3LC&|^0Zo9+APr(Q5KJJv#-%w%=2qqPcq9ELprCM9!WB@qcbB z5!=I2B_-4IP+T}9FA{@ypm!IK9OEmug|WtGQzSnFy5RU{Kw8yWAS*v5(BO-zj3lY- supn~FV2M*F!HxLz7qCJi{-GO?A8{mQexCqJXVnRH3iOmWe|F*g-`&?v=Kufz literal 0 HcmV?d00001 diff --git a/transport/session.py b/transport/session.py new file mode 100644 index 0000000..5ca833a --- /dev/null +++ b/transport/session.py @@ -0,0 +1,66 
@@ +from flask import request, session +from datetime import datetime +import re +from common import Reader, Writer +import json + +class HttpRequestReader(Reader): + """ + This class is designed to read data from an Http request file handler provided to us by flask + The file will be heald in memory and processed accordingly + NOTE: This is inefficient and can crash a micro-instance (becareful) + """ + + def __init__(self,**params): + self.file_length = 0 + try: + + #self.file = params['file'] + #self.file.seek(0, os.SEEK_END) + #self.file_length = self.file.tell() + + #print 'size of file ',self.file_length + self.content = params['file'].readlines() + self.file_length = len(self.content) + except Exception, e: + print "Error ... ",e + pass + + def isready(self): + return self.file_length > 0 + def read(self,size =-1): + i = 1 + for row in self.content: + i += 1 + if size == i: + break + yield row + +class HttpSessionWriter(Writer): + """ + This class is designed to write data to a session/cookie + """ + def __init__(self,**params): + """ + @param key required session key + """ + self.session = params['queue'] + self.session['sql'] = [] + self.session['csv'] = [] + self.tablename = re.sub('..+$','',params['filename']) + self.session['uid'] = params['uid'] + #self.xchar = params['xchar'] + + + def format_sql(self,row): + values = "','".join([col.replace('"','').replace("'",'') for col in row]) + return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) + def isready(self): + return True + def write(self,**params): + label = params['label'] + row = params ['row'] + + if label == 'usable': + self.session['csv'].append(self.format(row,',')) + self.session['sql'].append(self.format_sql(row)) From bc8f24d3cc03e17287a6989b11e4601003252425 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 Sep 2019 23:09:16 -0500 Subject: [PATCH 008/271] data transport framework for rabbitmq, mongodb, couchdb, ... 
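For context, here is a rough, hypothetical sketch (not part of any patch) of how the
HttpRequestReader and HttpSessionWriter classes added in transport/session.py above
might be wired together inside a flask view; the helper name, the uid and the filename
below are illustrative placeholders:

    from transport.session import HttpRequestReader, HttpSessionWriter

    def stage_upload(flask_session, uploaded_file):
        # Hypothetical helper: parse a comma-delimited upload and stage its rows
        # in the flask session as csv rows and sql insert statements.
        reader = HttpRequestReader(file=uploaded_file)
        if not reader.isready():
            return False
        writer = HttpSessionWriter(queue=flask_session, filename='upload.csv', uid='demo-user')
        for line in reader.read():
            writer.write(label='usable', row=line.strip().split(','))
        return True

The sketch assumes the uploaded file handler exposes readlines() (as flask uploads do) and
that rows are comma delimited; other delimiters would rely on the detection logic in common.py.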
---
 transport/__init__.pyc | Bin 2005 -> 0 bytes
 transport/common.pyc   | Bin 4344 -> 0 bytes
 transport/couch.pyc    | Bin 5405 -> 0 bytes
 transport/couchdb.pyc  | Bin 5447 -> 0 bytes
 transport/disk.pyc     | Bin 2855 -> 0 bytes
 transport/mongo.pyc    | Bin 3162 -> 0 bytes
 transport/queue.pyc    | Bin 6988 -> 0 bytes
 transport/s3.pyc       | Bin 3141 -> 0 bytes
 8 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 transport/__init__.pyc
 delete mode 100644 transport/common.pyc
 delete mode 100644 transport/couch.pyc
 delete mode 100644 transport/couchdb.pyc
 delete mode 100644 transport/disk.pyc
 delete mode 100644 transport/mongo.pyc
 delete mode 100644 transport/queue.pyc
 delete mode 100644 transport/s3.pyc

From e8328e577dd352f70f36480cf07d62a19ac0d491 Mon Sep 17 00:00:00 2001
From: Steve Nyemba
Date: Tue, 17 Sep 2019 11:21:42 -0500
Subject: [PATCH 009/271] bug fix

---
 README.md             | 
24 ++++++++++++++++++------ transport/__init__.py | 17 +++++++++-------- transport/couch.py | 16 ++++++++-------- transport/mongo.py | 2 +- transport/queue.py | 4 ++-- 5 files changed, 38 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 242d21a..1913cfe 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,24 @@ This project implements an abstraction of objects that can have access to a vari The basic usage revolves around a factory class (to be a singleton) import transport - - p = {"uri":"https://your-server:5984","dbname":"mydatabase","doc":"doc_id"} - couchdb = transport.Factory.instance(type='CouchdbReader',args=p) + from transport import factory + # + # importing a mongo reader + args = {"host":":","dbname":"","doc":"",["username":"","password":""]} + mreader = factory.instance(type='mongo.MonoReader',args=args) + # + # reading a document and executing a view + # + document = mreader.read() + result = mreader.view(name) + # + # importing a couchdb reader + args = {"url":":","dbname":"","doc":"","username":"","password":""} + creader = factory.instance(type='couch.CouchReader',args=args) # - # let's execute a view + # Reading a document and executing a view # - result = couchdb.view('view_name/function',key=value) - info = couchdb.read() \ No newline at end of file + document = dreader.read() + result = couchdb.view(id='',view_name=) + diff --git a/transport/__init__.py b/transport/__init__.py index 9b4f540..71d9570 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -22,7 +22,7 @@ The configuration for the data-store is as follows : username:, password:, dbname:, - uid: + doc: } } RabbitMQ: @@ -36,7 +36,7 @@ The configuration for the data-store is as follows : username:, password:, dbname:, - uid:s + doc:s } } @@ -46,11 +46,11 @@ import numpy as np import json import importlib from common import Reader, Writer #, factory -# import disk -# import queue -# import couch -# import mongo -# import s3 +import disk +import queue +import couch +import mongo +import s3 class factory : @staticmethod def instance(**args): @@ -68,12 +68,13 @@ class factory : # @TODO: Make sure objects are serializable, be smart about them !! # aClassName = ''.join([source,'(**params)']) - + else: stream = json.dumps(params) aClassName = ''.join([source,'(**',stream,')']) + try: anObject = eval( aClassName) #setattr(anObject,'name',source) diff --git a/transport/couch.py b/transport/couch.py index 6368a27..d3189f2 100644 --- a/transport/couch.py +++ b/transport/couch.py @@ -10,17 +10,17 @@ import json from common import Reader,Writer class Couch: """ + This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. 
@param url host & port reference - @param uid user id involved - + @param doc user id involved @param dbname database name (target) """ def __init__(self,**args): url = args['url'] - self.uid = args['uid'] + self.uid = args['doc'] dbname = args['dbname'] if 'username' not in args and 'password' not in args : - self.server = cloudant.CouchDB(url=url) + self.server = cloudant.CouchDB(None,None,url=url) else: self.server = cloudant.CouchDB(args['username'],args['password'],url=url) self.server.connect() @@ -56,10 +56,10 @@ class Couch: def view(self,**args): """ - We are executing a view - :id design document _design/xxxx (provide full name with _design prefix) - :view_name name of the view i.e - :key key to be used to filter the content + The function will execute a view (provivded a user is authenticated) + :id design document _design/xxxx (provide full name with _design prefix) + :view_name name of the view i.e + :key(s) key(s) to be used to filter the content """ document = cloudant.design_document.DesignDocument(self.dbase,args['id']) document.fetch() diff --git a/transport/mongo.py b/transport/mongo.py index 5c11e56..51b6e28 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -23,7 +23,7 @@ class Mongo : else: self.client = MongoClient() - self.uid = args['uid'] #-- document identifier + self.uid = args['doc'] #-- document identifier self.dbname = args['dbname'] self.db = self.client[self.dbname] diff --git a/transport/queue.py b/transport/queue.py index 846f591..64c770a 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -10,12 +10,12 @@ class MessageQueue: """ This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) :host - :uid identifier of the exchange + :xid identifier of the exchange :qid identifier of the queue """ def __init__(self,**params): self.host= params['host'] - self.uid = params['uid'] + self.uid = params['xid'] self.qid = params['qid'] def isready(self): From d438f69f02f4168a9a94ae0136fa402c38daa3bd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 Sep 2019 11:28:09 -0500 Subject: [PATCH 010/271] removed transport.py, old implementation --- transport.py | 758 --------------------------------------------------- 1 file changed, 758 deletions(-) delete mode 100755 transport.py diff --git a/transport.py b/transport.py deleted file mode 100755 index c6bd085..0000000 --- a/transport.py +++ /dev/null @@ -1,758 +0,0 @@ -""" - This file implements data transport stuctures in order to allow data to be moved to and from anywhere - We can thus read data from disk and write to the cloud,queue, or couchdb or SQL -""" -from flask import request, session -import os -import pika -import json -import numpy as np -from couchdbkit import Server -import re -from csv import reader -from datetime import datetime -import boto -import botocore -from smart_open import smart_open -""" - @TODO: Write a process by which the class automatically handles reading and creating a preliminary sample and discovers the meta data -""" -class Reader: - def __init__(self): - self.nrows = 0 - self.xchar = None - - def row_count(self): - content = self.read() - return np.sum([1 for row in content]) - """ - This function determines the most common delimiter from a subset of possible delimiters. 
It uses a statistical approach to guage the distribution of columns for a given delimiter - """ - def delimiter(self,sample): - - m = {',':[],'\t':[],'|':[],'\x3A':[]} - delim = m.keys() - for row in sample: - for xchar in delim: - if row.split(xchar) > 1: - m[xchar].append(len(row.split(xchar))) - else: - m[xchar].append(0) - - - - # - # The delimiter with the smallest variance, provided the mean is greater than 1 - # This would be troublesome if there many broken records sampled - # - m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} - index = m.values().index( min(m.values())) - xchar = m.keys()[index] - - return xchar - """ - This function determines the number of columns of a given sample - @pre self.xchar is not None - """ - def col_count(self,sample): - - m = {} - i = 0 - - for row in sample: - row = self.format(row) - id = str(len(row)) - #id = str(len(row.split(self.xchar))) - - if id not in m: - m[id] = 0 - m[id] = m[id] + 1 - - index = m.values().index( max(m.values()) ) - ncols = int(m.keys()[index]) - - - return ncols; - """ - This function will clean records of a given row by removing non-ascii characters - @pre self.xchar is not None - """ - def format (self,row): - - if isinstance(row,list) == False: - # - # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) - cols = self.split(row) - #cols = row.split(self.xchar) - else: - cols = row ; - return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] - - #if isinstance(row,list) == False: - # return (self.xchar.join(r)).format('utf-8') - #else: - # return r - """ - This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. - @pre : self.xchar is not None - """ - def split (self,row): - - pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) - return re.findall(pattern,row.replace('\n','')) - -class Writer: - - def format(self,row,xchar): - if xchar is not None and isinstance(row,list): - return xchar.join(row)+'\n' - elif xchar is None and isinstance(row,dict): - row = json.dumps(row) - return row - """ - It is important to be able to archive data so as to insure that growth is controlled - Nothing in nature grows indefinitely neither should data being handled. - """ - def archive(self): - pass - def flush(self): - pass - -""" - This class is designed to read data from an Http request file handler provided to us by flask - The file will be heald in memory and processed accordingly - NOTE: This is inefficient and can crash a micro-instance (becareful) -""" -class HttpRequestReader(Reader): - def __init__(self,**params): - self.file_length = 0 - try: - - #self.file = params['file'] - #self.file.seek(0, os.SEEK_END) - #self.file_length = self.file.tell() - - #print 'size of file ',self.file_length - self.content = params['file'].readlines() - self.file_length = len(self.content) - except Exception, e: - print "Error ... 
",e - pass - - def isready(self): - return self.file_length > 0 - def read(self,size =-1): - i = 1 - for row in self.content: - i += 1 - if size == i: - break - yield row - -""" - This class is designed to write data to a session/cookie -""" -class HttpSessionWriter(Writer): - """ - @param key required session key - """ - def __init__(self,**params): - self.session = params['queue'] - self.session['sql'] = [] - self.session['csv'] = [] - self.tablename = re.sub('..+$','',params['filename']) - self.session['uid'] = params['uid'] - #self.xchar = params['xchar'] - - - def format_sql(self,row): - values = "','".join([col.replace('"','').replace("'",'') for col in row]) - return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) - def isready(self): - return True - def write(self,**params): - label = params['label'] - row = params ['row'] - - if label == 'usable': - self.session['csv'].append(self.format(row,',')) - self.session['sql'].append(self.format_sql(row)) - -""" - This class is designed to read data from disk (location on hard drive) - @pre : isready() == True -""" -class DiskReader(Reader) : - """ - @param path absolute path of the file to be read - """ - def __init__(self,**params): - Reader.__init__(self) - self.path = params['path'] ; - - def isready(self): - return os.path.exists(self.path) - """ - This function reads the rows from a designated location on disk - @param size number of rows to be read, -1 suggests all rows - """ - def read(self,size=-1): - f = open(self.path,'rU') - i = 1 - for row in f: - - i += 1 - if size == i: - break - yield row - f.close() -""" - This function writes output to disk in a designated location -""" -class DiskWriter(Writer): - def __init__(self,**params): - if 'path' in params: - self.path = params['path'] - else: - self.path = None - if 'name' in params: - self.name = params['name']; - else: - self.name = None - if os.path.exists(self.path) == False: - os.mkdir(self.path) - """ - This function determines if the class is ready for execution or not - i.e it determines if the preconditions of met prior execution - """ - def isready(self): - - p = self.path is not None and os.path.exists(self.path) - q = self.name is not None - return p and q - """ - This function writes a record to a designated file - @param label - @param row row to be written - """ - def write(self,**params): - label = params['label'] - row = params['row'] - xchar = None - if 'xchar' is not None: - xchar = params['xchar'] - path = ''.join([self.path,os.sep,label]) - if os.path.exists(path) == False: - os.mkdir(path) ; - path = ''.join([path,os.sep,self.name]) - f = open(path,'a') - row = self.format(row,xchar); - f.write(row) - f.close() -""" - This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) -""" -class MessageQueue: - def __init__(self,**params): - self.host= params['host'] - self.uid = params['uid'] - self.qid = params['qid'] - - def isready(self): - #self.init() - resp = self.connection is not None and self.connection.is_open - self.close() - return resp - def close(self): - if self.connection.is_closed == False : - self.channel.close() - self.connection.close() -""" - This class is designed to publish content to an AMQP (Rabbitmq) - The class will rely on pika to implement this functionality - - We will publish information to a given queue for a given exchange -""" - -class QueueWriter(MessageQueue,Writer): - def __init__(self,**params): - #self.host= 
params['host'] - #self.uid = params['uid'] - #self.qid = params['queue'] - MessageQueue.__init__(self,**params); - - - def init(self,label=None): - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.info = self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) - if label is None: - self.qhandler = self.channel.queue_declare(queue=self.qid,durable=True) - else: - self.qhandler = self.channel.queue_declare(queue=label,durable=True) - - self.channel.queue_bind(exchange=self.uid,queue=self.qhandler.method.queue) - - - - """ - This function writes a stream of data to the a given queue - @param object object to be written (will be converted to JSON) - @TODO: make this less chatty - """ - def write(self,**params): - xchar = None - if 'xchar' in params: - xchar = params['xchar'] - object = self.format(params['row'],xchar) - - label = params['label'] - self.init(label) - _mode = 2 - if isinstance(object,str): - stream = object - _type = 'text/plain' - else: - stream = json.dumps(object) - if 'type' in params : - _type = params['type'] - else: - _type = 'application/json' - - self.channel.basic_publish( - exchange=self.uid, - routing_key=label, - body=stream, - properties=pika.BasicProperties(content_type=_type,delivery_mode=_mode) - ); - self.close() - - def flush(self,label): - self.init(label) - _mode = 1 #-- Non persistent - self.channel.queue_delete( queue=label); - self.close() - -""" - This class will read from a queue provided an exchange, queue and host - @TODO: Account for security and virtualhosts -""" -class QueueReader(MessageQueue,Reader): - """ - @param host host - @param uid exchange identifier - @param qid queue identifier - """ - def __init__(self,**params): - #self.host= params['host'] - #self.uid = params['uid'] - #self.qid = params['qid'] - MessageQueue.__init__(self,**params); - if 'durable' in params : - self.durable = True - else: - self.durable = False - self.size = -1 - self.data = {} - def init(self,qid): - - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) - - self.info = self.channel.queue_declare(queue=qid,durable=True) - - - - """ - This is the callback function designed to process the data stream from the queue - - """ - def callback(self,channel,method,header,stream): - - r = [] - if re.match("^\{|\[",stream) is not None: - r = json.loads(stream) - else: - - r = stream - - qid = self.info.method.queue - if qid not in self.data : - self.data[qid] = [] - - self.data[qid].append(r) - # - # We stop reading when the all the messages of the queue are staked - # - if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: - self.close() - - """ - This function will read, the first message from a queue - @TODO: - Implement channel.basic_get in order to retrieve a single message at a time - Have the number of messages retrieved be specified by size (parameter) - """ - def read(self,size=-1): - r = {} - self.size = size - # - # We enabled the reader to be able to read from several queues (sequentially for now) - # The qid parameter will be an array of queues the reader will be reading from - # - if isinstance(self.qid,basestring) : - self.qid = [self.qid] - for qid in self.qid: - self.init(qid) - # r[qid] = [] - - if 
self.info.method.message_count > 0: - - self.channel.basic_consume(self.callback,queue=qid,no_ack=False); - self.channel.start_consuming() - else: - - pass - #self.close() - # r[qid].append( self.data) - - return self.data -class QueueListener(QueueReader): - def init(self,qid): - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True ) - - self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) - - self.channel.queue_bind(exchange=self.uid,queue=self.info.method.queue,routing_key=qid) - #self.callback = callback - def read(self): - - self.init(self.qid) - self.channel.basic_consume(self.callback,queue=self.qid,no_ack=True); - self.channel.start_consuming() - -""" - This class is designed to write output as sql insert statements - The class will inherit from DiskWriter with minor adjustments - @TODO: Include script to create the table if need be using the upper bound of a learner -""" -class SQLDiskWriter(DiskWriter): - def __init__(self,**args): - DiskWriter.__init__(self,**args) - self.tablename = re.sub('\..+$','',self.name).replace(' ','_') - """ - @param label - @param row - @param xchar - """ - def write(self,**args): - label = args['label'] - row = args['row'] - - if label == 'usable': - values = "','".join([col.replace('"','').replace("'",'') for col in row]) - row = "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) - - args['row'] = row - DiskWriter.write(self,**args) -class Couchdb: - """ - @param uri host & port reference - @param uid user id involved - - @param dbname database name (target) - """ - def __init__(self,**args): - uri = args['uri'] - self.uid = args['uid'] - dbname = args['dbname'] - self.server = Server(uri=uri) - self.dbase = self.server.get_db(dbname) - if self.dbase.doc_exist(self.uid) == False: - self.dbase.save_doc({"_id":self.uid}) - """ - Insuring the preconditions are met for processing - """ - def isready(self): - p = self.server.info() != {} - if p == False or self.dbase.dbname not in self.server.all_dbs(): - return False - # - # At this point we are sure that the server is connected - # We are also sure that the database actually exists - # - q = self.dbase.doc_exist(self.uid) - if q == False: - return False - return True - def view(self,id,**args): - r =self.dbase.view(id,**args) - r = r.all() - return r[0]['value'] if len(r) > 0 else [] - -""" - This function will read an attachment from couchdb and return it to calling code. The attachment must have been placed before hand (otherwise oops) - @T: Account for security & access control -""" -class CouchdbReader(Couchdb,Reader): - """ - @param filename filename (attachment) - """ - def __init__(self,**args): - # - # setting the basic parameters for - Couchdb.__init__(self,**args) - if 'filename' in args : - self.filename = args['filename'] - else: - self.filename = None - - def isready(self): - # - # Is the basic information about the database valid - # - p = Couchdb.isready(self) - - if p == False: - return False - # - # The database name is set and correct at this point - # We insure the document of the given user has the requested attachment. 
- # - - doc = self.dbase.get(self.uid) - - if '_attachments' in doc: - r = self.filename in doc['_attachments'].keys() - - else: - r = False - - return r - def stream(self): - content = self.dbase.fetch_attachment(self.uid,self.filename).split('\n') ; - i = 1 - for row in content: - yield row - if size > 0 and i == size: - break - i = i + 1 - - def read(self,size=-1): - if self.filename is not None: - self.stream() - else: - return self.basic_read() - def basic_read(self): - document = self.dbase.get(self.uid) - del document['_id'], document['_rev'] - return document -""" - This class will write on a couchdb document provided a scope - The scope is the attribute that will be on the couchdb document -""" -class CouchdbWriter(Couchdb,Writer): - """ - @param uri host & port reference - @param uid user id involved - @param filename filename (attachment) - @param dbname database name (target) - """ - def __init__(self,**args): - - Couchdb.__init__(self,**args) - uri = args['uri'] - self.uid = args['uid'] - if 'filename' in args: - self.filename = args['filename'] - else: - self.filename = None - dbname = args['dbname'] - self.server = Server(uri=uri) - self.dbase = self.server.get_db(dbname) - # - # If the document doesn't exist then we should create it - # - - """ - write a given attribute to a document database - @param label scope of the row repair|broken|fixed|stats - @param row row to be written - """ - def write(self,**params): - - document = self.dbase.get(self.uid) - label = params['label'] - row = params['row'] - if label not in document : - document[label] = [] - document[label].append(row) - self.dbase.save_doc(document) - def flush(self,**params) : - - size = params['size'] if 'size' in params else 0 - has_changed = False - document = self.dbase.get(self.uid) - for key in document: - if key not in ['_id','_rev','_attachments'] : - content = document[key] - else: - continue - if isinstance(content,list) and size > 0: - index = len(content) - size - content = content[index:] - document[key] = content - - else: - document[key] = {} - has_changed = True - - self.dbase.save_doc(document) - - def archive(self,params=None): - document = self.dbase.get(self.uid) - content = {} - _doc = {} - for id in document: - if id in ['_id','_rev','_attachments'] : - _doc[id] = document[id] - else: - content[id] = document[id] - - content = json.dumps(content) - document= _doc - now = str(datetime.today()) - - name = '-'.join([document['_id'] , now,'.json']) - self.dbase.save_doc(document) - self.dbase.put_attachment(document,content,name,'application/json') -class s3 : - """ - @TODO: Implement a search function for a file given a bucket?? 
- """ - def __init__(self,args) : - """ - This function will extract a file or set of files from s3 bucket provided - @param access_key - @param secret_key - @param path location of the file - @param filter filename or filtering elements - """ - try: - self.s3 = boto.connect_s3(args['access_key'],args['secret_key']) - self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None - # self.path = args['path'] - self.filter = args['filter'] if 'filter' in args else None - self.filename = args['file'] if 'file' in args else None - - except Exception as e : - self.s3 = None - self.bucket = None - print e - def buckets(self): - """ - This function is a wrapper around the bucket list of buckets for s3 - - """ - return self.s3.get_all_buckets() - - -class s3Reader(s3,Reader) : - """ - Because s3 contains buckets and files, reading becomes a tricky proposition : - - list files if file is None - - stream content if file is Not None - @TODO: support read from all buckets, think about it - """ - def __init__(self,args) : - s3.__init__(self,args) - def files(self): - r = [] - try: - return [item.name for item in self.bucket if item.size > 0] - except Exception as e: - pass - return r - def stream(self,limit=-1): - """ - At this point we should stream a file from a given bucket - """ - key = self.bucket.get_key(self.filename.strip()) - if key is None : - yield None - else: - count = 0 - with smart_open(key) as remote_file: - for line in remote_file: - if count == limit and limit > 0 : - break - yield line - count += 1 - def read(self,limit=-1) : - if self.filename is None : - # - # returning the list of files because no one file was specified. - return self.files() - else: - return self.stream(10) -""" - This class acts as a factory to be able to generate an instance of a Reader/Writer - Against a Queue,Disk,Cloud,Couchdb - The class doesn't enforce parameter validation, thus any error with the parameters sent will result in a null Object -""" -class Factory: - def instance(self,**args): - source = args['type'] - params = args['args'] - anObject = None - - if source in ['HttpRequestReader','HttpSessionWriter']: - # - # @TODO: Make sure objects are serializable, be smart about them !! - # - aClassName = ''.join([source,'(**params)']) - - - else: - - stream = json.dumps(params) - aClassName = ''.join([source,'(**',stream,')']) - try: - - - anObject = eval( aClassName) - #setattr(anObject,'name',source) - except Exception,e: - print ['Error ',e] - return anObject -class s3Writer(s3,Writer) : - def __init__(self,args) : - s3.__init__(self,args) - -""" - This class implements a data-source handler that is intended to be used within the context of data processing, it allows to read/write anywhere transparently. 
- The class is a facade to a heterogeneous class hierarchy and thus simplifies how the calling code interacts with the class hierarchy -""" -class DataSource: - def __init__(self,sourceType='Disk',outputType='Disk',params={}): - self.Input = DataSourceFactory.instance(type=sourceType,args=params) - self.Output= DataSourceFactory.instance(type=outputType,args=params) - def read(self,size=-1): - return self.Input.read(size) - def write(self,**args): - self.Output.write(**args) -conf = json.loads(open('config.json').read()) -#x = s3Reader( dict(conf,**{'bucket':'com.phi.sample.data','file':'Sample-Spreadsheet-5000-rows.csv'})) -x = s3Reader(conf) -print conf -print x.bucket.get_all_keys() -# r = x.read() -# for item in r : -# print item -#print buckets[1].get_key('Sample-Spreadsheet-5000-rows.csv') From fc59dcc8136d13bd97c50c1ee6be5bf05ee5f120 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 Sep 2019 11:53:44 -0500 Subject: [PATCH 011/271] documentation and housekeeping work --- .gitignore | 1 + README.md | 13 ++++++++++++- setup.py | 10 +++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/README.md b/README.md index 1913cfe..4e862af 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,23 @@ # Introduction -This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write functions associated and specific to the data-sources. The classes implement functionalities against : +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple interface against specific various data-sources. The supported data sources implement functionalities against : - Rabbitmq-server - Couchdb-server + - Mongodb-server - Http Session : {csv,tab,pipe,sql} - Disk{Reader|Writer} : csv, tab, pipe, sql on disk +Such an interface is used to facilitate data transport in and out of a store for whatever an application may need (log, session management, ...) 
+ +### Installation + +Within the virtual environment perform the following command: + + pip install git+https://dev.the-phi.com/git/steve/data-transport.git + +Binaries and eggs will be provided later on + ### Usage diff --git a/setup.py b/setup.py index acd5d78..78ceb07 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,9 @@ This is a build file for the """ from setuptools import setup, find_packages - +import os +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() setup( name = "data-transport", version = "1.0", @@ -10,10 +12,12 @@ setup( author_email = "steve@the-phi.com", license = "MIT", packages=['transport'], + keywords=['mongodb','couchdb','rabbitmq','file','read','write','s3'], install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'], - + url="https://dev.the-phi.com/git/steve/data-transport.git", use_2to3=True, - convert_2to3_doctests=['src/your/module/README.txt'], + long_description=read('README.md'), + convert_2to3_doctests=['README.md'], use_2to3_fixers=['your.fixers'], use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], ) From 0c6d6956aa350637ac515d2c161f39fba08cedf9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 Sep 2019 12:00:45 -0500 Subject: [PATCH 012/271] housekeeping/documentation --- transport/common.py | 2 +- transport/mongo.py | 6 ++++++ transport/queue.py | 7 +++++++ transport/s3.py | 6 ++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/transport/common.py b/transport/common.py index c55141b..993a961 100644 --- a/transport/common.py +++ b/transport/common.py @@ -6,7 +6,7 @@ This module is designed to serve as a wrapper to a set of supported data stores - couchdb - mongodb - Files (character delimited) - - Queues (RabbmitMq) + - Queues (Rabbmitmq) - Session (Flask) - s3 The supported operations are read/write and providing meta data to the calling code diff --git a/transport/mongo.py b/transport/mongo.py index 51b6e28..48235e8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -1,3 +1,9 @@ +""" +Data Transport - 1.0 +Steve L. Nyemba, The Phi Technology LLC + +This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) +""" from pymongo import MongoClient # from transport import Reader,Writer from common import Reader, Writer diff --git a/transport/queue.py b/transport/queue.py index 64c770a..1199687 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -1,3 +1,10 @@ +""" +Data Transport - 1.0 +Steve L. Nyemba, The Phi Technology LLC + +This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange) + +""" import pika from datetime import datetime import re diff --git a/transport/s3.py b/transport/s3.py index 9b117db..12412d5 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -1,3 +1,9 @@ +""" +Data Transport - 1.0 +Steve L. 
Nyemba, The Phi Technology LLC + +This file is a wrapper around s3 bucket provided by AWS for reading and writing content +""" from datetime import datetime import boto import botocore From 92d4e952e1cf9fac28f068f40949a78ff2978954 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 4 Nov 2019 21:26:28 -0600 Subject: [PATCH 013/271] bug fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 78ceb07..df614a5 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,6 @@ setup( use_2to3=True, long_description=read('README.md'), convert_2to3_doctests=['README.md'], - use_2to3_fixers=['your.fixers'], + #use_2to3_fixers=['your.fixers'], use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], ) From d0a0334f7ad0c8f51a09f0fc911463f93b951ab6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 4 Nov 2019 21:51:20 -0600 Subject: [PATCH 014/271] compatibility to python 3.6 ... --- transport/__init__.py | 28 ++++++++++++++++++++-------- transport/couch.py | 6 +++++- transport/mongo.py | 6 +++++- transport/queue.py | 14 +++++++++----- transport/s3.py | 8 ++++++-- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 71d9570..41ba63a 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -45,12 +45,24 @@ __author__ = 'The Phi Technology' import numpy as np import json import importlib -from common import Reader, Writer #, factory -import disk -import queue -import couch -import mongo -import s3 +import sys + +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer #, factory + from transport import disk + from transport import queue as queue + from transport import couch as couch + from transport import mongo as mongo + from transport import s3 as s3 +else: + from common import Reader, Writer #, factory + import disk + import queue + import couch + import mongo + import s3 + + class factory : @staticmethod def instance(**args): @@ -78,7 +90,7 @@ class factory : try: anObject = eval( aClassName) #setattr(anObject,'name',source) - except Exception,e: + except Exception as e: print ['Error ',e] return anObject @@ -207,4 +219,4 @@ class factory : # #setattr(anObject,'name',source) # except Exception,e: # print ['Error ',e] -# return anObject \ No newline at end of file +# return anObject diff --git a/transport/couch.py b/transport/couch.py index d3189f2..7aaf93e 100644 --- a/transport/couch.py +++ b/transport/couch.py @@ -7,7 +7,11 @@ This file is a wrapper around couchdb using IBM Cloudant SDK that has an interfa """ import cloudant import json -from common import Reader,Writer +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer +else: + from common import Reader, Writer class Couch: """ This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. 
diff --git a/transport/mongo.py b/transport/mongo.py index 48235e8..3854206 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -6,7 +6,11 @@ This file is a wrapper around mongodb for reading/writing content against a mong """ from pymongo import MongoClient # from transport import Reader,Writer -from common import Reader, Writer +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer +else: + from common import Reader, Writer import json class Mongo : """ diff --git a/transport/queue.py b/transport/queue.py index 1199687..0b5228c 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -10,7 +10,11 @@ from datetime import datetime import re import json import os -from common import Reader, Writer +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer +else: + from common import Reader, Writer import json class MessageQueue: @@ -31,9 +35,9 @@ class MessageQueue: self.close() return resp def close(self): - if self.connection.is_closed == False : - self.channel.close() - self.connection.close() + if self.connection.is_closed == False : + self.channel.close() + self.connection.close() class QueueWriter(MessageQueue,Writer): """ @@ -204,4 +208,4 @@ class QueueListener(QueueReader): self.init(self.qid) self.channel.basic_consume(self.callback,queue=self.qid,no_ack=True); self.channel.start_consuming() - \ No newline at end of file + diff --git a/transport/s3.py b/transport/s3.py index 12412d5..19d98b6 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -8,9 +8,13 @@ from datetime import datetime import boto import botocore from smart_open import smart_open -from common import Reader, Writer +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer +else: + from common import Reader, Writer import json -from common import Reader, Writer + class s3 : """ From ac64f8de95ec73481eb2f09393e26a1e05c2f69b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 5 Nov 2019 16:04:54 -0600 Subject: [PATCH 015/271] bug fixes --- transport/__init__.py | 2 +- transport/common.py | 12 ++++++------ transport/queue.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 41ba63a..2502240 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -91,7 +91,7 @@ class factory : anObject = eval( aClassName) #setattr(anObject,'name',source) except Exception as e: - print ['Error ',e] + print(['Error ',e]) return anObject # class Reader: diff --git a/transport/common.py b/transport/common.py index 993a961..e49fbdb 100644 --- a/transport/common.py +++ b/transport/common.py @@ -39,7 +39,7 @@ class Reader: """ m = {',':[],'\t':[],'|':[],'\x3A':[]} - delim = m.keys() + delim = list(m.keys()) for row in sample: for xchar in delim: if row.split(xchar) > 1: @@ -53,9 +53,9 @@ class Reader: # The delimiter with the smallest variance, provided the mean is greater than 1 # This would be troublesome if there many broken records sampled # - m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} - index = m.values().index( min(m.values())) - xchar = m.keys()[index] + m = {id: np.var(m[id]) for id in list(m.keys()) if m[id] != [] and int(np.mean(m[id]))>1} + index = list(m.values()).index( min(m.values())) + xchar = list(m.keys())[index] return xchar def col_count(self,sample): @@ -76,8 +76,8 @@ class Reader: m[id] = 0 m[id] = m[id] + 1 - index = m.values().index( max(m.values()) ) - ncols = int(m.keys()[index]) + index = 
list(m.values()).index( max(m.values()) ) + ncols = int(list(m.keys())[index]) return ncols; diff --git a/transport/queue.py b/transport/queue.py index 0b5228c..eccde1e 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -175,7 +175,7 @@ class QueueReader(MessageQueue,Reader): # We enabled the reader to be able to read from several queues (sequentially for now) # The qid parameter will be an array of queues the reader will be reading from # - if isinstance(self.qid,basestring) : + if isinstance(self.qid,str) : self.qid = [self.qid] for qid in self.qid: self.init(qid) From 008ae6edf96c6aa181945b2a251b22ce300b4970 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 5 Nov 2019 16:05:06 -0600 Subject: [PATCH 016/271] bug fix with mongodb --- transport/mongo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/transport/mongo.py b/transport/mongo.py index 3854206..e363008 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -65,8 +65,10 @@ class MongoWriter(Mongo,Writer): def write(self,**args): # document = self.db[self.uid].find() collection = self.db[self.uid] - collection.update_one() - self.db[self.uid].insert_one(args['row']) + if type(args['row']) == list : + self.db[self.uid].insert_many(args['row']) + else: + self.db[self.uid].insert_one(args['row']) def set(self,document): collection = self.db[self.uid] if collection.count_document() > 0 : From b0380be86117a434102b260177096def98d6a998 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 1 Dec 2019 21:23:54 -0600 Subject: [PATCH 017/271] bug fix with file writer --- setup.py | 42 ++++++++++++++++++++++++++---------------- transport/disk.py | 15 +++++++++------ 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index df614a5..fbcde4c 100644 --- a/setup.py +++ b/setup.py @@ -3,21 +3,31 @@ This is a build file for the """ from setuptools import setup, find_packages import os +import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -setup( - name = "data-transport", - version = "1.0", - author = "The Phi Technology LLC", - author_email = "steve@the-phi.com", - license = "MIT", - packages=['transport'], - keywords=['mongodb','couchdb','rabbitmq','file','read','write','s3'], - install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'], - url="https://dev.the-phi.com/git/steve/data-transport.git", - use_2to3=True, - long_description=read('README.md'), - convert_2to3_doctests=['README.md'], - #use_2to3_fixers=['your.fixers'], - use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], - ) +args = {"name":"data-transport","version":"1.0.0","author":"The Phi Technology LLC","author_email":"info@the-phi.com","license":"MIT","packages":["transport"]} +args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3'] +args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'] +args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" + +if sys.version_info[0] == 2 : + args['use_2to3'] = True + args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] +setup(**args) +# setup( +# name = "data-transport", +# version = "1.0", +# author = "The Phi Technology LLC", +# author_email = "steve@the-phi.com", +# license = "MIT", +# packages=['transport'], +# keywords=['mongodb','couchdb','rabbitmq','file','read','write','s3'], +# install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'], +# 
url="https://dev.the-phi.com/git/steve/data-transport.git", +# use_2to3=True, +# long_description=read('README.md'), +# convert_2to3_doctests=['README.md'], +# #use_2to3_fixers=['your.fixers'], +# use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], +# ) diff --git a/transport/disk.py b/transport/disk.py index 5186698..d110f23 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -46,7 +46,7 @@ class DiskWriter(Writer): if 'name' in params: self.name = params['name']; else: - self.name = None + self.name = 'out.log' if os.path.exists(self.path) == False: os.mkdir(self.path) @@ -71,12 +71,15 @@ class DiskWriter(Writer): xchar = None if 'xchar' is not None: xchar = params['xchar'] - path = ''.join([self.path,os.sep,label]) - if os.path.exists(path) == False: - os.mkdir(path) ; + #path = ''.join([self.path,os.sep,label]) + path = ''.join([self.path,os.sep,self.name]) + #if os.path.exists(path) == False: + # os.mkdir(path) ; path = ''.join([path,os.sep,self.name]) f = open(path,'a') - row = self.format(row,xchar); + if isinstance(row,object): + row = json.dumps(row) + #row = self.format(row,xchar); f.write(row) f.close() - \ No newline at end of file + From aaad4003a94b66b07118b7f20d9f908cc6569be2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 1 Dec 2019 21:32:47 -0600 Subject: [PATCH 018/271] bug fix --- transport/disk.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index d110f23..be17550 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -20,8 +20,8 @@ class DiskReader(Reader) : return os.path.exists(self.path) def read(self,size=-1): """ - This function reads the rows from a designated location on disk - @param size number of rows to be read, -1 suggests all rows + This function reads the rows from a designated location on disk + @param size number of rows to be read, -1 suggests all rows """ f = open(self.path,'rU') @@ -47,8 +47,8 @@ class DiskWriter(Writer): self.name = params['name']; else: self.name = 'out.log' - if os.path.exists(self.path) == False: - os.mkdir(self.path) + # if os.path.exists(self.path) == False: + # os.mkdir(self.path) def isready(self): """ @@ -66,20 +66,20 @@ class DiskWriter(Writer): @param row row to be written """ - label = params['label'] + # label = params['label'] row = params['row'] - xchar = None - if 'xchar' is not None: - xchar = params['xchar'] + # xchar = None + # if 'xchar' is not None: + # xchar = params['xchar'] #path = ''.join([self.path,os.sep,label]) - path = ''.join([self.path,os.sep,self.name]) + # path = ''.join([self.path,os.sep,self.name]) #if os.path.exists(path) == False: # os.mkdir(path) ; - path = ''.join([path,os.sep,self.name]) - f = open(path,'a') - if isinstance(row,object): - row = json.dumps(row) + # path = ''.join([path,os.sep,self.name]) + f = open(self.path,'a') + if isinstance(row,object): + row = json.dumps(row) #row = self.format(row,xchar); - f.write(row) + f.write(row+"\n") f.close() From 081ed080d7247a245a6c047feb5ec2b6b4e69d31 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 1 Feb 2020 09:42:45 -0600 Subject: [PATCH 019/271] bug fixes for version 1.0.8, streamlining interface --- setup.py | 7 ++- transport/common.py | 136 +++++++++++++++++--------------------------- transport/couch.py | 89 +++++++++++++++++++---------- transport/disk.py | 57 +++++++++---------- transport/mongo.py | 79 +++++++++++++++++++++---- 5 files changed, 212 insertions(+), 156 deletions(-) diff --git a/setup.py b/setup.py index 
fbcde4c..5768b66 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,12 @@ import os import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-transport","version":"1.0.0","author":"The Phi Technology LLC","author_email":"info@the-phi.com","license":"MIT","packages":["transport"]} +args = { + "name":"data-transport", + "version":"1.0.8", + "author":"The Phi Technology LLC","author_email":"info@the-phi.com", + "license":"MIT", + "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3'] args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'] args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" diff --git a/transport/common.py b/transport/common.py index e49fbdb..0ad7fd4 100644 --- a/transport/common.py +++ b/transport/common.py @@ -14,7 +14,8 @@ Requirements : pymongo boto couldant - +@TODO: + Enable read/writing to multiple reads/writes """ __author__ = 'The Phi Technology' import numpy as np @@ -22,107 +23,72 @@ import json import importlib # import couch # import mongo -class Reader: - def __init__(self): - self.nrows = 0 - self.xchar = None - - def row_count(self): - content = self.read() - return np.sum([1 for row in content]) - def delimiter(self,sample): +class IO: + def init(self,**args): """ - This function determines the most common delimiter from a subset of possible delimiters. - It uses a statistical approach (distribution) to guage the distribution of columns for a given delimiter - - :sample sample string/content expecting matrix i.e list of rows + This function enables attributes to be changed at runtime. Only the attributes defined in the class can be changed + Adding attributes will require sub-classing otherwise we may have an unpredictable class ... 
""" - - m = {',':[],'\t':[],'|':[],'\x3A':[]} - delim = list(m.keys()) - for row in sample: - for xchar in delim: - if row.split(xchar) > 1: - m[xchar].append(len(row.split(xchar))) - else: - m[xchar].append(0) - - - - # - # The delimiter with the smallest variance, provided the mean is greater than 1 - # This would be troublesome if there many broken records sampled - # - m = {id: np.var(m[id]) for id in list(m.keys()) if m[id] != [] and int(np.mean(m[id]))>1} - index = list(m.values()).index( min(m.values())) - xchar = list(m.keys())[index] - - return xchar - def col_count(self,sample): - """ - This function retirms the number of columns of a given sample - @pre self.xchar is not None + allowed = list(vars(self).keys()) + for field in args : + if field not in allowed : + continue + value = args[field] + setattr(self,field,value) +class Reader (IO): + """ + This class is an abstraction of a read functionalities of a data store + """ + def __init__(self): + pass + def meta(self): """ - - m = {} - i = 0 - - for row in sample: - row = self.format(row) - id = str(len(row)) - #id = str(len(row.split(self.xchar))) - - if id not in m: - m[id] = 0 - m[id] = m[id] + 1 - - index = list(m.values()).index( max(m.values()) ) - ncols = int(list(m.keys())[index]) - - - return ncols; - def format (self,row): + This function is intended to return meta-data associated with what has just been read + @return object of meta data information associated with the content of the store """ - This function will clean records of a given row by removing non-ascii characters - @pre self.xchar is not None + raise Exception ("meta function needs to be implemented") + def read(**args): """ - - if isinstance(row,list) == False: - # - # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) - cols = self.split(row) - #cols = row.split(self.xchar) - else: - cols = row ; - return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] - - def split (self,row): + This function is intended to read the content of a store provided parameters to be used at the discretion of the subclass """ - This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. - @pre : self.xchar is not None - """ - - pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) - return re.findall(pattern,row.replace('\n','')) + raise Exception ("read function needs to be implemented") -class Writer: - +class Writer(IO): + def __init__(self): + self.cache = {"default":[]} + def log(self,**args): + self.cache[id] = args + def meta (self,id="default",**args): + raise Exception ("meta function needs to be implemented") def format(self,row,xchar): if xchar is not None and isinstance(row,list): return xchar.join(row)+'\n' elif xchar is None and isinstance(row,dict): row = json.dumps(row) return row - """ + def write(self,**args): + """ + This function will write content to a store given parameters to be used at the discretion of the sub-class + """ + raise Exception ("write function needs to be implemented") + + def archive(self): + """ It is important to be able to archive data so as to insure that growth is controlled Nothing in nature grows indefinitely neither should data being handled. 
- """ - def archive(self): - pass - def flush(self): + """ + raise Exception ("archive function needs to be implemented") + def close(self): + """ + This function will close the persistent storage connection/handler + """ pass - +class ReadWriter(Reader,Writer) : + """ + This class implements the read/write functions aggregated + """ + pass # class factory : # @staticmethod # def instance(**args): diff --git a/transport/couch.py b/transport/couch.py index 7aaf93e..9e9bf93 100644 --- a/transport/couch.py +++ b/transport/couch.py @@ -15,13 +15,13 @@ else: class Couch: """ This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. - @param url host & port reference + @param url host & port reference default http://localhost:5984 @param doc user id involved @param dbname database name (target) """ def __init__(self,**args): - url = args['url'] - self.uid = args['doc'] + url = args['url'] if 'url' in args else 'http://localhost:5984' + self._id = args['doc'] dbname = args['dbname'] if 'username' not in args and 'password' not in args : self.server = cloudant.CouchDB(None,None,url=url) @@ -34,9 +34,9 @@ class Couch: # # @TODO Check if the database exists ... # - doc = cloudant.document.Document(self.dbase,self.uid) #self.dbase.get(self.uid) + doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id) if not doc.exists(): - doc = self.dbase.create_document({"_id":self.uid}) + doc = self.dbase.create_document({"_id":self._id}) doc.save() else: self.dbase = None @@ -51,8 +51,8 @@ class Couch: # At this point we are sure that the server is connected # We are also sure that the database actually exists # - doc = cloudant.document.Document(self.dbase,self.uid) - # q = self.dbase.all_docs(key=self.uid)['rows'] + doc = cloudant.document.Document(self.dbase,self._id) + # q = self.dbase.all_docs(key=self._id)['rows'] # if not q : if not doc.exists(): return False @@ -107,7 +107,7 @@ class CouchReader(Couch,Reader): # # We insure the document of the given user has the requested attachment. # # - # doc = self.dbase.get(self.uid) + # doc = self.dbase.get(self._id) # if '_attachments' in doc: # r = self.filename in doc['_attachments'].keys() @@ -120,8 +120,8 @@ class CouchReader(Couch,Reader): # # @TODO Need to get this working ... 
# - document = cloudant.document.Document(self.dbase,self.uid) - # content = self.dbase.fetch_attachment(self.uid,self.filename).split('\n') ; + document = cloudant.document.Document(self.dbase,self._id) + # content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ; content = self.get_attachment(self.filename) for row in content: yield row @@ -132,9 +132,9 @@ class CouchReader(Couch,Reader): else: return self.basic_read() def basic_read(self): - document = cloudant.document.Document(self.dbase,self.uid) + document = cloudant.document.Document(self.dbase,self._id) - # document = self.dbase.get(self.uid) + # document = self.dbase.get(self._id) if document.exists() : document.fetch() document = dict(document) @@ -157,32 +157,62 @@ class CouchWriter(Couch,Writer): """ Couch.__init__(self,**args) - - def write(self,**params): + def set (self,info): + document = cloudant.document.Document(self.dbase,self._id) + if document.exists() : + keys = list(set(document.keys()) - set(['_id','_rev','_attachments'])) + for id in keys : + document.field_set(document,id,None) + for id in info : + value = info[id] + document.field_set(document,id,value) + + document.save() + pass + else: + _document = dict({"_id":self._id},**info) + self.dbase.create_document(_document) + def write(self,info): """ write a given attribute to a document database - @param label scope of the row repair|broken|fixed|stats - @param row row to be written + @param info dict of attributes/values to be written to the document """ - # document = self.dbase.get(self.uid) - document = cloudant.document.Document(self.dbase,self.uid) #.get(self.uid) + # document = self.dbase.get(self._id) + document = cloudant.document.Document(self.dbase,self._id) #.get(self._id) if document.exists() is False : - document = self.dbase.create_document({"_id":self.uid}) + document = self.dbase.create_document({"_id":self._id}) - label = params['label'] - row = params['row'] - if label not in document : - document[label] = [] - document[label].append(row) + # label = params['label'] + # row = params['row'] + # if label not in document : + # document[label] = [] + # document[label].append(row) + for key in info : + if key in document and type(document[key]) == list : + document[key] += info[key] + else: + document[key] = info[key] + document.save() # self.dbase.bulk_docs([document]) # self.dbase.save_doc(document) - + + def upload(self,**args): + """ + :param filename name of the file to be uploaded + :param data content of the file (binary or text) + :param content_type MIME type of the attachment (default text/plain) + """ + mimetype = args['content_type'] if 'content_type' in args else 'text/plain' + document = cloudant.document.Document(self.dbase,self._id) + document.put_attachment(self.dbase,args['filename'],mimetype,args['data']) + document.save() + def archive(self,params=None): """ This function will archive the document onto itself.
""" - # document = self.dbase.all_docs(self.uid,include_docs=True) + # document = self.dbase.all_docs(self._id,include_docs=True) document = cloudant.document.Document(self.dbase,self.filename) document.fetch() content = {} @@ -196,8 +226,9 @@ class CouchWriter(Couch,Writer): # document= _doc now = str(datetime.today()) - name = '-'.join([document['_id'] , now,'.json']) + name = '-'.join([document['_id'] , now,'.json']) + self.upload(filename=name,data=content,content_type='application/json') # self.dbase.bulk_docs([document]) # self.dbase.put_attachment(document,content,name,'application/json') - document.put_attachment(self.dbase,name,'application/json',content) - document.save() + # document.put_attachment(self.dbase,name,'application/json',content) + # document.save() diff --git a/transport/disk.py b/transport/disk.py index be17550..a051045 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -14,8 +14,8 @@ class DiskReader(Reader) : """ Reader.__init__(self) - self.path = params['path'] ; - + self.path = params['path'] ; + self.delimiter = params['delimiter'] if 'delimiter' in params else None def isready(self): return os.path.exists(self.path) def read(self,size=-1): @@ -31,55 +31,54 @@ class DiskReader(Reader) : i += 1 if size == i: break + if self.delimiter : + yield row.split(self.char) yield row f.close() class DiskWriter(Writer): """ - This function writes output to disk in a designated location + This function writes output to disk in a designated location. The function will write a text to a text file + - If a delimiter is provided it will use that to generate a xchar-delimited file + - If not then the object will be dumped as is """ def __init__(self,**params): + Writer.__init__(self) + self.cache['meta'] = {'cols':0,'rows':0,'delimiter':None} if 'path' in params: self.path = params['path'] else: - self.path = None - if 'name' in params: - self.name = params['name']; - else: - self.name = 'out.log' + self.path = 'data-transport.log' + self.delimiter = params['delimiter'] if 'delimiter' in params else None + # if 'name' in params: + # self.name = params['name']; + # else: + # self.name = 'data-transport.log' # if os.path.exists(self.path) == False: # os.mkdir(self.path) - + def meta(self): + return self.cache['meta'] def isready(self): """ This function determines if the class is ready for execution or not i.e it determines if the preconditions of met prior execution """ - - p = self.path is not None and os.path.exists(self.path) - q = self.name is not None - return p and q - def write(self,**params): + return True + # p = self.path is not None and os.path.exists(self.path) + # q = self.name is not None + # return p and q + def format (self,row): + self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) + self.cache['meta']['rows'] += 1 + return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" + def write(self,info): """ This function writes a record to a designated file @param label @param row row to be written """ - - # label = params['label'] - row = params['row'] - # xchar = None - # if 'xchar' is not None: - # xchar = params['xchar'] - #path = ''.join([self.path,os.sep,label]) - # path = ''.join([self.path,os.sep,self.name]) - #if os.path.exists(path) == False: - # os.mkdir(path) ; - # path = ''.join([path,os.sep,self.name]) f = open(self.path,'a') - if isinstance(row,object): - row = json.dumps(row) - #row = self.format(row,xchar); - f.write(row+"\n") + f.write(self.format(info)) f.close() + diff --git 
a/transport/mongo.py b/transport/mongo.py index e363008..ce7165d 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -4,7 +4,12 @@ Steve L. Nyemba, The Phi Technology LLC This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) """ -from pymongo import MongoClient +from pymongo import MongoClient +from bson.objectid import ObjectId +from bson.binary import Binary +import json +from datetime import datetime +import gridfs # from transport import Reader,Writer import sys if sys.version_info[0] > 2 : @@ -19,11 +24,11 @@ class Mongo : def __init__(self,**args): """ :dbname database name/identifier - :host host and port of the database + :host host and port of the database by default localhost:27017 :username username for authentication :password password for current user """ - host = args['host'] + host = args['host'] if 'host' in args else 'localhost:27017' if 'user' in args and 'password' in args: self.client = MongoClient(host, @@ -31,7 +36,7 @@ class Mongo : password=args['password'] , authMechanism='SCRAM-SHA-256') else: - self.client = MongoClient() + self.client = MongoClient(host) self.uid = args['doc'] #-- document identifier self.dbname = args['dbname'] @@ -62,17 +67,67 @@ class MongoWriter(Mongo,Writer): """ def __init__(self,**args): Mongo.__init__(self,**args) - def write(self,**args): + def upload(self,**args) : + """ + This function will upload a file to the current database (using GridFS) + :param data binary stream/text to be stored + :param filename filename to be used + :param encoding content_encoding (default utf-8) + + """ + if 'encoding' not in args : + args['encoding'] = 'utf-8' + gfs = gridfs.GridFS(self.db) + gfs.put(**args) + + def archive(self): + """ + This function will archive the documents of the current collection to a JSON file stored via GridFS and then clear the collection + """ + collection = self.db[self.uid] + rows = list(collection.find()) + for row in rows : + if type(row['_id']) == ObjectId : + row['_id'] = str(row['_id']) + stream = Binary(json.dumps(rows).encode()) + collection.delete_many({}) + now = "-".join([str(datetime.now().year),str(datetime.now().month), str(datetime.now().day)]) + name = ".".join([self.uid,'archive',now])+".json" + description = " ".join([self.uid,'archive',str(len(rows))]) + self.upload(filename=name,data=stream,description=description,content_type='application/json') + # gfs = GridFS(self.db) + # gfs.put(filename=name,description=description,data=stream,encoding='utf-8') + # self.write({{"filename":name,"file":stream,"description":descriptions}}) + + + pass + def write(self,info): + """ + This function will write to a given collection i.e add a record to a collection (no updates) + @param info new record in the collection to be added + """ # document = self.db[self.uid].find() collection = self.db[self.uid] - if type(args['row']) == list : - self.db[self.uid].insert_many(args['row']) + # if type(info) == list : + # self.db[self.uid].insert_many(info) + # else: + if (type(info) == list) : + self.db[self.uid].insert_many(info) else: - self.db[self.uid].insert_one(args['row']) + self.db[self.uid].insert_one(info) def set(self,document): + """ + if no identifier is provided the function will delete the entire collection and set the new document. + Please use this function with great care (archive the content first before using it...
for safety) + """ + collection = self.db[self.uid] - if collection.count_document() > 0 : - collection.delete({_id:self.uid}) - - collecton.update_one({"_id":self.uid},document,True) + if collection.count_document() > 0 and '_id' in document: + id = document['_id'] + del document['_id'] + collection.find_one_and_replace({'_id':id},document) + else: + collection.delete_many({}) + self.write(info) + # collecton.update_one({"_id":self.uid},document,True) From c268a117c291741b916cee6304c031edf0b461d6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Feb 2020 15:06:35 -0600 Subject: [PATCH 020/271] bug fixes with upgraded version of pika --- setup.py | 2 +- transport/queue.py | 143 ++++++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 62 deletions(-) diff --git a/setup.py b/setup.py index 5768b66..b95c6d2 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.0.8", + "version":"1.0.9", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/queue.py b/transport/queue.py index eccde1e..bfb4acf 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -25,10 +25,33 @@ class MessageQueue: :qid identifier of the queue """ def __init__(self,**params): - self.host= params['host'] - self.uid = params['xid'] - self.qid = params['qid'] + self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server + self.port= 5672 if 'port' not in params else params['port'] + self.virtual_host = '/' if 'vhost' not in params else params['vhost'] + self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange + self.queue = params['queue'] + self.connection = None + self.channel = None + + self.credentials= pika.PlainCredentials('guest','guest') + if 'username' in params : + self.credentials = pika.PlainCredentials( + params['username'], + ('' if 'password' not in params else params['password']) + ) + def init(self,label=None): + properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host,credentials=self.credentials) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) + if label is None: + self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True) + else: + self.qhandler = self.channel.queue_declare(queue=label,durable=True) + + self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue) + def isready(self): #self.init() resp = self.connection is not None and self.connection.is_open @@ -48,22 +71,13 @@ class QueueWriter(MessageQueue,Writer): """ def __init__(self,**params): #self.host= params['host'] - #self.uid = params['uid'] - #self.qid = params['queue'] + #self.exchange = params['uid'] + #self.queue = params['queue'] MessageQueue.__init__(self,**params); + self.init() - def init(self,label=None): - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.info = self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) - if label is None: - self.qhandler = self.channel.queue_declare(queue=self.qid,durable=True) - else: - self.qhandler = 
self.channel.queue_declare(queue=label,durable=True) - - self.channel.queue_bind(exchange=self.uid,queue=self.qhandler.method.queue) + @@ -72,37 +86,37 @@ class QueueWriter(MessageQueue,Writer): @param object object to be written (will be converted to JSON) @TODO: make this less chatty """ - def write(self,**params): - xchar = None - if 'xchar' in params: - xchar = params['xchar'] - object = self.format(params['row'],xchar) + def write(self,data,_type='text/plain'): + # xchar = None + # if 'xchar' in params: + # xchar = params['xchar'] + # object = self.format(params['row'],xchar) - label = params['label'] - self.init(label) - _mode = 2 - if isinstance(object,str): - stream = object - _type = 'text/plain' - else: - stream = json.dumps(object) - if 'type' in params : - _type = params['type'] - else: - _type = 'application/json' - + # label = params['label'] + # self.init(label) + # _mode = 2 + # if isinstance(object,str): + # stream = object + # _type = 'text/plain' + # else: + # stream = json.dumps(object) + # if 'type' in params : + # _type = params['type'] + # else: + # _type = 'application/json' + stream = json.dumps(data) if isinstance(data,dict) else data self.channel.basic_publish( - exchange=self.uid, - routing_key=label, + exchange=self.exchange, + routing_key=self.queue, body=stream, - properties=pika.BasicProperties(content_type=_type,delivery_mode=_mode) + properties=pika.BasicProperties(content_type=_type,delivery_mode=2) ); - self.close() + # self.close() - def flush(self,label): - self.init(label) + def flush(self): + self.init() _mode = 1 #-- Non persistent - self.channel.queue_delete( queue=label); + self.channel.queue_delete( queue=self.queue); self.close() class QueueReader(MessageQueue,Reader): @@ -119,23 +133,24 @@ class QueueReader(MessageQueue,Reader): """ #self.host= params['host'] - #self.uid = params['uid'] - #self.qid = params['qid'] + #self.exchange = params['uid'] + #self.queue = params['qid'] MessageQueue.__init__(self,**params); + self.init() if 'durable' in params : self.durable = True else: self.durable = False self.size = -1 self.data = {} - def init(self,qid): + # def init(self,qid): - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True) + # properties = pika.ConnectionParameters(host=self.host) + # self.connection = pika.BlockingConnection(properties) + # self.channel = self.connection.channel() + # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True) - self.info = self.channel.queue_declare(queue=qid,durable=True) + # self.info = self.channel.queue_declare(queue=qid,durable=True) def callback(self,channel,method,header,stream): @@ -175,9 +190,9 @@ class QueueReader(MessageQueue,Reader): # We enabled the reader to be able to read from several queues (sequentially for now) # The qid parameter will be an array of queues the reader will be reading from # - if isinstance(self.qid,str) : - self.qid = [self.qid] - for qid in self.qid: + if isinstance(self.queue,str) : + self.queue = [self.queue] + for qid in self.queue: self.init(qid) # r[qid] = [] @@ -193,19 +208,25 @@ class QueueReader(MessageQueue,Reader): return self.data class QueueListener(QueueReader): - def init(self,qid): - properties = pika.ConnectionParameters(host=self.host) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - 
self.channel.exchange_declare(exchange=self.uid,type='direct',durable=True ) + """ + This class is designed to have an active listener (worker) against a specified Exchange/Queue + It is initialized as would any other object and will require a callback function to address the objects returned. + """ + # def init(self,qid): + # properties = pika.ConnectionParameters(host=self.host) + # self.connection = pika.BlockingConnection(properties) + # self.channel = self.connection.channel() + # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True ) - self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) + # self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) - self.channel.queue_bind(exchange=self.uid,queue=self.info.method.queue,routing_key=qid) + # self.channel.queue_bind(exchange=self.exchange,queue=self.info.method.queue,routing_key=qid) #self.callback = callback + def callback(self,channel,method,header,stream) : + raise Exception("....") def read(self): - self.init(self.qid) - self.channel.basic_consume(self.callback,queue=self.qid,no_ack=True); + self.init(self.queue) + self.channel.basic_consume(self.queue,self.callback,auto_ack=True); self.channel.start_consuming() From a16b969b699f402eb32636a25bbb94c0e96cf19f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Feb 2020 18:14:30 -0600 Subject: [PATCH 021/271] bug fix & aliasing --- transport/queue.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/transport/queue.py b/transport/queue.py index bfb4acf..e16159a 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -33,6 +33,9 @@ class MessageQueue: self.connection = None self.channel = None + self.name = self.__class__.__name__.lower() if 'name' not in params else 'wtf' + + self.credentials= pika.PlainCredentials('guest','guest') if 'username' in params : self.credentials = pika.PlainCredentials( @@ -57,6 +60,8 @@ class MessageQueue: resp = self.connection is not None and self.connection.is_open self.close() return resp + def finalize(self): + pass def close(self): if self.connection.is_closed == False : self.channel.close() @@ -81,12 +86,13 @@ class QueueWriter(MessageQueue,Writer): - """ + + def write(self,data,_type='text/plain'): + """ This function writes a stream of data to the a given queue @param object object to be written (will be converted to JSON) @TODO: make this less chatty - """ - def write(self,data,_type='text/plain'): + """ # xchar = None # if 'xchar' in params: # xchar = params['xchar'] @@ -207,11 +213,14 @@ class QueueReader(MessageQueue,Reader): # r[qid].append( self.data) return self.data -class QueueListener(QueueReader): +class QueueListener(MessageQueue): """ This class is designed to have an active listener (worker) against a specified Exchange/Queue It is initialized as would any other object and will require a callback function to address the objects returned. 
""" + def __init__(self,**args): + MessageQueue.__init__(self,**args) + self.listen = self.read # def init(self,qid): # properties = pika.ConnectionParameters(host=self.host) # self.connection = pika.BlockingConnection(properties) @@ -222,11 +231,18 @@ class QueueListener(QueueReader): # self.channel.queue_bind(exchange=self.exchange,queue=self.info.method.queue,routing_key=qid) #self.callback = callback + + def finalize(self,channel,ExceptionReason): + pass + def callback(self,channel,method,header,stream) : raise Exception("....") def read(self): self.init(self.queue) + self.channel.basic_consume(self.queue,self.callback,auto_ack=True); self.channel.start_consuming() + + From 172b609b0ef4a4fa33ee2a82050f74b98a5ae16d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 Feb 2020 15:48:23 -0600 Subject: [PATCH 022/271] bug fix --- transport/queue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/queue.py b/transport/queue.py index e16159a..f88fcd0 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -58,7 +58,7 @@ class MessageQueue: def isready(self): #self.init() resp = self.connection is not None and self.connection.is_open - self.close() + # self.close() return resp def finalize(self): pass From 1bb7c1bd0a5c9ff6c119b5388ca26f2d4d692555 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 2 Mar 2020 11:33:35 -0600 Subject: [PATCH 023/271] Bug fix with queueReader --- setup.py | 2 +- transport/queue.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index b95c6d2..cc1bb64 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.0.9", + "version":"1.1.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/queue.py b/transport/queue.py index f88fcd0..dcc2b57 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -172,7 +172,7 @@ class QueueReader(MessageQueue,Reader): r = stream - qid = self.info.method.queue + qid = self.qhandler.method.queue if qid not in self.data : self.data[qid] = [] @@ -202,7 +202,7 @@ class QueueReader(MessageQueue,Reader): self.init(qid) # r[qid] = [] - if self.info.method.message_count > 0: + if self.qhandler.method.message_count > 0: self.channel.basic_consume(self.callback,queue=qid,no_ack=False); self.channel.start_consuming() From 834dc80e794de6693406c80e053d28075f63f493 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Mar 2020 02:23:21 -0600 Subject: [PATCH 024/271] bug fix (updating pika interface) --- setup.py | 2 +- transport/queue.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index cc1bb64..af06eda 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.1.0", + "version":"1.1.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/queue.py b/transport/queue.py index dcc2b57..485b771 100644 --- a/transport/queue.py +++ b/transport/queue.py @@ -142,7 +142,7 @@ class QueueReader(MessageQueue,Reader): #self.exchange = params['uid'] #self.queue = params['qid'] MessageQueue.__init__(self,**params); - self.init() + # self.init() if 'durable' in params : self.durable = True else: @@ -198,13 +198,14 @@ 
class QueueReader(MessageQueue,Reader): # if isinstance(self.queue,str) : self.queue = [self.queue] + for qid in self.queue: self.init(qid) # r[qid] = [] if self.qhandler.method.message_count > 0: - self.channel.basic_consume(self.callback,queue=qid,no_ack=False); + self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False); self.channel.start_consuming() else: From d5ba648abf1925acc5dceeb029cc75a7ec8cf507 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 17 May 2020 21:57:18 -0500 Subject: [PATCH 025/271] support for filters in read --- transport/__init__.py | 5 ++- transport/common.py | 2 +- transport/disk.py | 10 +++-- transport/mongo.py | 7 ++-- transport/{queue.py => rabbitmq.py} | 4 +- transport/s3.py | 60 ++++++++++++++++++++++++----- 6 files changed, 68 insertions(+), 20 deletions(-) rename transport/{queue.py => rabbitmq.py} (98%) diff --git a/transport/__init__.py b/transport/__init__.py index 2502240..e5a3418 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -50,10 +50,11 @@ import sys if sys.version_info[0] > 2 : from transport.common import Reader, Writer #, factory from transport import disk - from transport import queue as queue + + from transport import s3 as s3 + from transport import rabbitmq as queue from transport import couch as couch from transport import mongo as mongo - from transport import s3 as s3 else: from common import Reader, Writer #, factory import disk diff --git a/transport/common.py b/transport/common.py index 0ad7fd4..6e595ae 100644 --- a/transport/common.py +++ b/transport/common.py @@ -47,7 +47,7 @@ class Reader (IO): @return object of meta data information associated with the content of the store """ raise Exception ("meta function needs to be implemented") - def read(**args): + def read(self,**args): """ This function is intended to read the content of a store provided parameters to be used at the discretion of the subclass """ diff --git a/transport/disk.py b/transport/disk.py index a051045..00c4a87 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -1,5 +1,9 @@ import os -from .__init__ import Reader,Writer +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer #, factory +else: + from common import Reader,Writer import json class DiskReader(Reader) : @@ -18,12 +22,12 @@ class DiskReader(Reader) : self.delimiter = params['delimiter'] if 'delimiter' in params else None def isready(self): return os.path.exists(self.path) - def read(self,size=-1): + def read(self,**args): """ This function reads the rows from a designated location on disk @param size number of rows to be read, -1 suggests all rows """ - + size = -1 if 'size' not in args else int(args['size']) f = open(self.path,'rU') i = 1 for row in f: diff --git a/transport/mongo.py b/transport/mongo.py index ce7165d..48c1bc8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -39,7 +39,7 @@ class Mongo : self.client = MongoClient(host) self.uid = args['doc'] #-- document identifier - self.dbname = args['dbname'] + self.dbname = args['dbname'] if 'db' in args else args['db'] self.db = self.client[self.dbname] def isready(self): @@ -53,9 +53,10 @@ class MongoReader(Mongo,Reader): """ def __init__(self,**args): Mongo.__init__(self,**args) - def read(self,size=-1): + def read(self,**args): collection = self.db[self.uid] - return collection.find({}) + _filter = args['filter'] if 'filter' in args else {} + return collection.find(_filter) def view(self,**args): """ This function is designed to execute a view 
(map/reduce) operation diff --git a/transport/queue.py b/transport/rabbitmq.py similarity index 98% rename from transport/queue.py rename to transport/rabbitmq.py index 485b771..41d016a 100644 --- a/transport/queue.py +++ b/transport/rabbitmq.py @@ -183,7 +183,7 @@ class QueueReader(MessageQueue,Reader): if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: self.close() - def read(self,size=-1): + def read(self,**args): """ This function will read, the first message from a queue @TODO: @@ -191,7 +191,7 @@ class QueueReader(MessageQueue,Reader): Have the number of messages retrieved be specified by size (parameter) """ r = {} - self.size = size + self.size = -1 if 'size' in args else int(args['size']) # # We enabled the reader to be able to read from several queues (sequentially for now) # The qid parameter will be an array of queues the reader will be reading from diff --git a/transport/s3.py b/transport/s3.py index 19d98b6..1a67317 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -6,6 +6,8 @@ This file is a wrapper around s3 bucket provided by AWS for reading and writing """ from datetime import datetime import boto +from boto.s3.connection import S3Connection, OrdinaryCallingFormat +import numpy as np import botocore from smart_open import smart_open import sys @@ -14,13 +16,14 @@ if sys.version_info[0] > 2 : else: from common import Reader, Writer import json - +from io import StringIO +import json class s3 : """ @TODO: Implement a search function for a file given a bucket?? """ - def __init__(self,args) : + def __init__(self,**args) : """ This function will extract a file or set of files from s3 bucket provided @param access_key @@ -29,18 +32,39 @@ class s3 : @param filter filename or filtering elements """ try: - self.s3 = boto.connect_s3(args['access_key'],args['secret_key']) + self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None # self.path = args['path'] self.filter = args['filter'] if 'filter' in args else None self.filename = args['file'] if 'file' in args else None + self.bucket_name = args['bucket'] if 'bucket' in args else None except Exception as e : self.s3 = None self.bucket = None print (e) + def meta(self,**args): + """ + :name name of the bucket + """ + info = self.list(**args) + [item.open() for item in info] + return [{"name":item.name,"size":item.size} for item in info] + def list(self,**args): + """ + This function will list the content of a bucket, the bucket must be provided by the name + :name name of the bucket + """ + return list(self.s3.get_bucket(args['name']).list()) + def buckets(self): + # + # This function will return all buckets, not sure why but it should be used cautiously + # based on why the s3 infrastructure is used + # + return [item.name for item in self.s3.get_all_buckets()] + # def buckets(self): pass # """ @@ -56,8 +80,8 @@ class s3Reader(s3,Reader) : - stream content if file is Not None @TODO: support read from all buckets, think about it """ - def __init__(self,args) : - s3.__init__(self,args) + def __init__(self,**args) : + s3.__init__(self,**args) def files(self): r = [] try: @@ -80,14 +104,32 @@ class s3Reader(s3,Reader) : break yield line count += 1 - def read(self,limit=-1) : + def read(self,**args) : if self.filename is None : # # returning the list of files because no one file was specified. 
return self.files() else: - return self.stream(10) + limit = args['size'] if 'size' in args else -1 + return self.stream(limit) class s3Writer(s3,Writer) : - def __init__(self,args) : - s3.__init__(self,args) + + def __init__(self,args) : + s3.__init__(self,args) + def mkdir(self,name): + """ + This function will create a folder in a bucket + :name name of the folder + """ + self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) + def write(self,content): + file = StringIO(content.decode("utf8")) + self.s3.upload_fileobj(file,self.bucket_name,self.filename) + pass + +if __name__ == '__main__' : + p = {'access_key':'AKIAJO7KII27XH3TCPJQ','secret_key':'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'} + reader = s3Reader(**p) + buckets = reader.buckets() + print(reader.list(name = buckets[0])) From 21ef8421378428dcff86a98fcdab10cd875a9e00 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 17 May 2020 22:53:54 -0500 Subject: [PATCH 026/271] bug fix --- transport/couch.py | 2 +- transport/mongo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/couch.py b/transport/couch.py index 9e9bf93..8e02a4e 100644 --- a/transport/couch.py +++ b/transport/couch.py @@ -126,7 +126,7 @@ class CouchReader(Couch,Reader): for row in content: yield row - def read(self,size=-1): + def read(self,**args): if self.filename is not None: self.stream() else: diff --git a/transport/mongo.py b/transport/mongo.py index 48c1bc8..72a439c 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -39,7 +39,7 @@ class Mongo : self.client = MongoClient(host) self.uid = args['doc'] #-- document identifier - self.dbname = args['dbname'] if 'db' in args else args['db'] + self.dbname = args['dbname'] if 'dbname' in args else args['db'] self.db = self.client[self.dbname] def isready(self): From 617f829e1eb94f1638bec98e54b6e8808db55b69 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 27 Jul 2020 21:47:02 -0500 Subject: [PATCH 027/271] bug fix:sqlitewriter --- setup.py | 2 +- transport/disk.py | 89 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index af06eda..8a6a504 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.1.2", + "version":"1.1.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/disk.py b/transport/disk.py index 00c4a87..3fbaecc 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -5,13 +5,15 @@ if sys.version_info[0] > 2 : else: from common import Reader,Writer import json +from threading import Lock +import sqlite3 class DiskReader(Reader) : """ This class is designed to read data from disk (location on hard drive) @pre : isready() == True """ - + def __init__(self,**params): """ @param path absolute path of the file to be read @@ -40,12 +42,13 @@ class DiskReader(Reader) : yield row f.close() class DiskWriter(Writer): + """ This function writes output to disk in a designated location. 
The function will write a text to a text file - If a delimiter is provided it will use that to generate a xchar-delimited file - If not then the object will be dumped as is """ - + THREAD_LOCK = Lock() def __init__(self,**params): Writer.__init__(self) self.cache['meta'] = {'cols':0,'rows':0,'delimiter':None} @@ -81,8 +84,84 @@ class DiskWriter(Writer): @param label @param row row to be written """ - f = open(self.path,'a') - f.write(self.format(info)) - f.close() + try: + DiskWriter.THREAD_LOCK.acquire() + f = open(self.path,'a') + if self.delimiter : + if type(info) == list : + for row in info : + f.write(self.format(row)) + else: + f.write(self.format(info)) + else: + if not type(info) == str : + f.write(json.dumps(info)) + else: + f.write(info) + f.close() + except Exception as e: + # + # Not sure what should be done here ... + pass + finally: + DiskWriter.THREAD_LOCK.release() +class SQLiteWriter(DiskWriter) : + def __init__(self,**args): + """ + :path + :fields json|csv + """ + DiskWriter.__init__(self,**args) + self.table = args['table'] + + self.conn = sqlite3.connect(self.path,isolation_level=None) + self.conn.row_factory = sqlite3.Row + self.fields = args['fields'] if 'fields' in args else [] + if self.fields and not self.isready(): + self.init(self.fields) + + def init(self,fields): + self.fields = fields; + sql = " ".join(["CREATE TABLE IF NOT EXISTS ",self.table," (", ",".join(self.fields),")"]) + + cursor = self.conn.cursor() + cursor.execute(sql) + cursor.close() + self.conn.commit() + def isready(self): + try: + sql = "SELECT count(*) FROM sqlite_master where name=':table'" + sql = sql.replace(":table",self.table) + cursor = self.conn.cursor() + + r = cursor.execute(sql) + r = r.fetchall() + cursor.close() + + return r[0][0] + except Exception as e: + pass + return 0 + # + # If the table doesn't exist we should create it + # + def write(self,info): + """ + """ + + if not self.fields : + self.init(list(info.keys())) + + if type(info) != list : + info = [info] + cursor = self.conn.cursor() + + + sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"]) + for row in info : + cursor.execute(sql.replace(":values",json.dumps(row))) + # self.conn.commit() + # print (sql) + From 4a985e82dc693a199a597285149b898d90b367c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Aug 2020 16:42:21 -0500 Subject: [PATCH 028/271] bug fix with diskwriter (adding return carriage) --- transport/disk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transport/disk.py b/transport/disk.py index 3fbaecc..02d03c6 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -161,7 +161,9 @@ class SQLiteWriter(DiskWriter) : sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"]) for row in info : - cursor.execute(sql.replace(":values",json.dumps(row))) + stream = json.dumps(row) + stream = stream.replace("'","''") + cursor.execute(sql.replace(":values",stream) ) # self.conn.commit() # print (sql) From 5676c4991ffe371104aeba3239526d4acdd7544b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Aug 2020 17:20:46 -0500 Subject: [PATCH 029/271] json object writing ... 
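With this change, a dict (or any non-string object) handed to DiskWriter.write() is serialized and appended as a single JSON line, while plain strings are written as-is. A minimal usage sketch, assuming the package is installed; the file path and records below are illustrative:

    from transport import disk

    writer = disk.DiskWriter(path='data-transport.log')   # illustrative path; defaults to data-transport.log
    writer.write({"status": "ok", "count": 10})           # non-string objects are dumped as one JSON line
    writer.write("plain text entry")                       # strings are appended unchanged
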
--- setup.py | 2 +- transport/disk.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8a6a504..60340ee 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.1.6", + "version":"1.1.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/disk.py b/transport/disk.py index 02d03c6..0639086 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -95,7 +95,7 @@ class DiskWriter(Writer): f.write(self.format(info)) else: if not type(info) == str : - f.write(json.dumps(info)) + f.write(json.dumps(info)+"\n") else: f.write(info) f.close() From cbf641a986e6b6f4b43dd6495456b2cd5a5ff775 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 22 Sep 2020 13:30:04 -0500 Subject: [PATCH 030/271] bug fix with reader --- transport/disk.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/transport/disk.py b/transport/disk.py index 0639086..c36489d 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -105,6 +105,24 @@ class DiskWriter(Writer): pass finally: DiskWriter.THREAD_LOCK.release() +class SQLiteReader (DiskReader): + def __init__(self,**args) + DiskReader.__init__(self,**args) + self.conn = sqlite3.connect(self.path,isolation_level=None) + self.conn.row_factory = sqlite3.Row + self.table = args['table'] + def read(self,**args): + if 'sql' in args : + sql = args['sql'] + else if 'filter' in args : + sql = "SELECT :fields FROM ",self.table, "WHERE (:filter)".replace(":filter",args['filter']) + sql = sql.replace(":fields",args['fields']) if 'fields' in args else sql.replace(":fields","*") + return = pd.read_sql(sql,self.conn) + def close(self): + try: + self.conn.close() + except Exception as e : + pass class SQLiteWriter(DiskWriter) : def __init__(self,**args): From 40f8b0420be7cb0f001080fd6dae732c170a2763 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 22 Sep 2020 13:31:47 -0500 Subject: [PATCH 031/271] bug fix with reader --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 60340ee..0745b4e 100644 --- a/setup.py +++ b/setup.py @@ -8,11 +8,11 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.1.8", + "version":"1.2.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} -args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3'] +args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'] args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" From e4a1ef8dd7644dceb6da3478898f58212774607f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 26 Sep 2020 16:53:33 -0500 Subject: [PATCH 032/271] bug fix --- setup.py | 2 +- transport/disk.py | 6 +++--- transport/mongo.py | 32 +++++++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 0745b4e..3676055 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.2.0", + "version":"1.2.2", "author":"The Phi Technology 
LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/disk.py b/transport/disk.py index c36489d..25ff336 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -106,7 +106,7 @@ class DiskWriter(Writer): finally: DiskWriter.THREAD_LOCK.release() class SQLiteReader (DiskReader): - def __init__(self,**args) + def __init__(self,**args): DiskReader.__init__(self,**args) self.conn = sqlite3.connect(self.path,isolation_level=None) self.conn.row_factory = sqlite3.Row @@ -114,10 +114,10 @@ class SQLiteReader (DiskReader): def read(self,**args): if 'sql' in args : sql = args['sql'] - else if 'filter' in args : + elif 'filter' in args : sql = "SELECT :fields FROM ",self.table, "WHERE (:filter)".replace(":filter",args['filter']) sql = sql.replace(":fields",args['fields']) if 'fields' in args else sql.replace(":fields","*") - return = pd.read_sql(sql,self.conn) + return pd.read_sql(sql,self.conn) def close(self): try: self.conn.close() diff --git a/transport/mongo.py b/transport/mongo.py index 72a439c..2fa0f60 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -17,6 +17,7 @@ if sys.version_info[0] > 2 : else: from common import Reader, Writer import json +import re class Mongo : """ Basic mongodb functions are captured here @@ -54,9 +55,34 @@ class MongoReader(Mongo,Reader): def __init__(self,**args): Mongo.__init__(self,**args) def read(self,**args): - collection = self.db[self.uid] - _filter = args['filter'] if 'filter' in args else {} - return collection.find(_filter) + if 'mongo' in args : + # + # @TODO: + cmd = args['mongo'] + r = [] + out = self.db.command(cmd) + #@TODO: consider using a yield (generator) works wonders + while True : + if 'cursor' in out : + key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' + else: + key = 'n' + if 'cursor' in out and out['cursor'][key] : + r += list(out['cursor'][key]) + elif out[key]: + r.append (out[key]) + # yield out['cursor'][key] + if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : + break + else: + out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) + + + return r + else: + collection = self.db[self.uid] + _filter = args['filter'] if 'filter' in args else {} + return collection.find(_filter) def view(self,**args): """ This function is designed to execute a view (map/reduce) operation From 18d6ff90feabece973529f0670ab9b65e538b900 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 6 Oct 2020 15:26:06 -0500 Subject: [PATCH 033/271] close function for mongodb --- transport/mongo.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transport/mongo.py b/transport/mongo.py index 2fa0f60..e86f1e8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -47,6 +47,8 @@ class Mongo : p = self.dbname in self.client.list_database_names() q = self.uid in self.client[self.dbname].list_collection_names() return p and q + def close(self): + self.db.close() class MongoReader(Mongo,Reader): """ From e70802ef005e8c58303eea9fa84a6257aedc048a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 8 Oct 2020 17:14:35 -0500 Subject: [PATCH 034/271] mongo thread/process safe? 
--- transport/mongo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/mongo.py b/transport/mongo.py index e86f1e8..04e2cad 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -37,7 +37,7 @@ class Mongo : password=args['password'] , authMechanism='SCRAM-SHA-256') else: - self.client = MongoClient(host) + self.client = MongoClient(host,maxPoolSize=10000) self.uid = args['doc'] #-- document identifier self.dbname = args['dbname'] if 'dbname' in args else args['db'] From cbed09ea08fece4f4f61880e906aabdd5cb22d18 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 8 Oct 2020 17:15:14 -0500 Subject: [PATCH 035/271] version 1.2.4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3676055..1d628d4 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.2.2", + "version":"1.2.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 9eb46954cd409c1feb06e89558329888386cd3a5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Nov 2020 14:26:46 -0600 Subject: [PATCH 036/271] bug fixes: mongodb and s3 bucket --- setup.py | 2 +- transport/mongo.py | 2 +- transport/s3.py | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 1d628d4..46719ba 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.2.4", + "version":"1.2.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index 04e2cad..e0e024d 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -71,7 +71,7 @@ class MongoReader(Mongo,Reader): key = 'n' if 'cursor' in out and out['cursor'][key] : r += list(out['cursor'][key]) - elif out[key]: + elif key in out and out[key]: r.append (out[key]) # yield out['cursor'][key] if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : diff --git a/transport/s3.py b/transport/s3.py index 1a67317..6c26dd3 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -128,8 +128,3 @@ class s3Writer(s3,Writer) : self.s3.upload_fileobj(file,self.bucket_name,self.filename) pass -if __name__ == '__main__' : - p = {'access_key':'AKIAJO7KII27XH3TCPJQ','secret_key':'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'} - reader = s3Reader(**p) - buckets = reader.buckets() - print(reader.list(name = buckets[0])) From 4b49b0a060074ef24e52d1f00b507afefe79a4c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 15 Nov 2020 21:19:40 -0600 Subject: [PATCH 037/271] bug fix with mongodb client --- setup.py | 2 +- transport/mongo.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 46719ba..1f1e4e4 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.2.6", + "version":"1.2.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index e0e024d..99cca05 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -48,7 +48,7 @@ class Mongo : q = self.uid 
in self.client[self.dbname].list_collection_names() return p and q def close(self): - self.db.close() + self.client.close() class MongoReader(Mongo,Reader): """ @@ -158,5 +158,7 @@ class MongoWriter(Mongo,Writer): else: collection.delete_many({}) self.write(info) + def close(self): + Mongo.close(self) # collecton.update_one({"_id":self.uid},document,True) From c36b07073a2db4d1040115943dc31960094e08e0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 10 Dec 2020 03:42:30 -0600 Subject: [PATCH 038/271] bug fix with s3 hierarchy --- setup.py | 4 ++-- transport/s3.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1f1e4e4..d1e6f83 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.2.8", + "version":"1.3.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'] +args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open','botocore'] args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" if sys.version_info[0] == 2 : diff --git a/transport/s3.py b/transport/s3.py index 6c26dd3..8af8e9d 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -115,7 +115,7 @@ class s3Reader(s3,Reader) : class s3Writer(s3,Writer) : - def __init__(self,args) : + def __init__(self,**args) : s3.__init__(self,args) def mkdir(self,name): """ From 2be9d9db2f75fec4efc41ae1a4cec65629e9ca93 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 10 Dec 2020 03:56:08 -0600 Subject: [PATCH 039/271] bug fix: aws s3 handler --- transport/s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/s3.py b/transport/s3.py index 8af8e9d..339cb5c 100644 --- a/transport/s3.py +++ b/transport/s3.py @@ -116,7 +116,7 @@ class s3Reader(s3,Reader) : class s3Writer(s3,Writer) : def __init__(self,**args) : - s3.__init__(self,args) + s3.__init__(self,**args) def mkdir(self,name): """ This function will create a folder in a bucket From 6f08d64e499d438059acf366166de03567ab71ee Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 11 Dec 2020 02:46:31 -0600 Subject: [PATCH 040/271] bug fix: sqlite import pandas --- transport/disk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/disk.py b/transport/disk.py index 25ff336..1b9c648 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -7,6 +7,7 @@ else: import json from threading import Lock import sqlite3 +import pandas as pd class DiskReader(Reader) : """ From 03e0203c288ad54121420b7f4768ed9ea08232d6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 11 Dec 2020 03:22:49 -0600 Subject: [PATCH 041/271] bug fix: sqlite lock --- transport/disk.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 1b9c648..64cabd4 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -5,10 +5,10 @@ if sys.version_info[0] > 2 : else: from common import Reader,Writer import json -from threading import Lock +# from threading import Lock import sqlite3 import pandas as pd - +from multiprocessing import Lock class DiskReader(Reader) : """ This class is designed to read data from disk (location on hard drive) 
@@ -126,6 +126,8 @@ class SQLiteReader (DiskReader): pass class SQLiteWriter(DiskWriter) : + connection = None + LOCK = Lock() def __init__(self,**args): """ :path @@ -140,7 +142,7 @@ class SQLiteWriter(DiskWriter) : if self.fields and not self.isready(): self.init(self.fields) - + SQLiteWriter.connection = self.conn def init(self,fields): self.fields = fields; sql = " ".join(["CREATE TABLE IF NOT EXISTS ",self.table," (", ",".join(self.fields),")"]) @@ -175,14 +177,18 @@ class SQLiteWriter(DiskWriter) : if type(info) != list : info = [info] - cursor = self.conn.cursor() - - sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"]) - for row in info : - stream = json.dumps(row) - stream = stream.replace("'","''") - cursor.execute(sql.replace(":values",stream) ) + SQLiteWriter.LOCK.acquire() + try: + cursor = self.conn.cursor() + sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"]) + for row in info : + stream = json.dumps(row) + stream = stream.replace("'","''") + cursor.execute(sql.replace(":values",stream) ) + # self.conn.commit() # print (sql) - + except Exception as e : + pass + SQLiteWriter.LOCK.release() \ No newline at end of file From 7ba09309d3e7746d0405240bb8855e3281b284d5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 26 Dec 2020 12:20:40 -0600 Subject: [PATCH 042/271] bug fix: mongodb commands with values --- setup.py | 2 +- transport/mongo.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d1e6f83..6bb0540 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.0", + "version":"1.3.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index 99cca05..f206482 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -65,6 +65,8 @@ class MongoReader(Mongo,Reader): out = self.db.command(cmd) #@TODO: consider using a yield (generator) works wonders while True : + if 'values' in out : + r += out['values'] if 'cursor' in out : key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' else: From 504596f8385fed64c73998d25f4c9a2830c770fa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 31 Dec 2020 11:12:34 -0600 Subject: [PATCH 043/271] new: added sql handler for Redshift,PostgreSQL, MySQL/Mariadb --- setup.py | 4 +- transport/sql.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 transport/sql.py diff --git a/setup.py b/setup.py index 6bb0540..2bbc294 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.2", + "version":"1.3.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open','botocore'] +args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" if sys.version_info[0] == 
2 : diff --git a/transport/sql.py b/transport/sql.py new file mode 100644 index 0000000..078f457 --- /dev/null +++ b/transport/sql.py @@ -0,0 +1,145 @@ +""" +This file is intended to perform read/writes against an SQL database such as PostgreSQL, Redshift, Mysql, MsSQL ... + +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +""" +import psycopg2 as pg +import mysql.connector as my +import sys +if sys.version_info[0] > 2 : + from transport.common import Reader, Writer #, factory +else: + from common import Reader,Writer +import json +# from threading import Lock + +import pandas as pd + +class SQLRW : + PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306"} + DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my} + def __init__(self,**_args): + + + _info = {} + _info['dbname'] = _args['db'] + self.table = _args['table'] + self.fields = _args['fields'] if 'fields' in _args else [] + + if 'host' in _args : + _info['host'] = 'localhost' if 'host' not in _args else _args['host'] + _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] + + if 'username' in _args or 'user' in _args: + key = 'username' if 'username' in _args else 'user' + _info['user'] = _args[key] + _info['password'] = _args['password'] + # + # We need to load the drivers here to see what we are dealing with ... + _handler = SQLWriter.DRIVERS[_args['provider']] + self.conn = _handler.connect(**_info) + def apply(self,_sql): + """ + This function applies a command and/or a query against the current relational data-store + :param _sql insert/select statement + @TODO: Store procedure calls + """ + cursor = self.conn.cursor() + try: + if "insert" in _sql .lower() or "update" in _sql.lower() : + # Executing a command i.e no expected return values ... 
+ cursor.execute(_sql) + else: + cursor.close() + return pd.read_sql(_sql,self.conn) + + finally: + cursor.close() + def close(self): + try: + self.connect.close() + except Exception as error : + print (error) + pass +class SQLReader(SQLRW,Reader) : + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + if 'sql' in _args : + _sql = (_args['sql']) + else: + _sql = "SELECT :fields FROM "+self.table + if 'filter' in _args : + _sql = _sql +" WHERE "+_args['filter'] + _fields = '*' if not self.fields else ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + if 'limit' in _args : + _sql = _sql + " LIMIT "+str(_args['limit']) + return self.apply(_sql) + +class SQLWriter(SQLRW,Writer): + def __init__(self,**_args) : + super().__init__(**_args) + def init(self,fields): + if not fields : + try: + self.fields = pd.read_sql("SELECT * FROM :table LIMIT 1".replace(":table",self.table),self.conn).columns.tolist() + finally: + pass + else: + self.fields = fields; + + def make(self,fields): + self.fields = fields + sql = " ".join(["CREATE TABLE",self.table," (", ",".join(fields),")"]) + cursor = self.conn.cursor() + try: + cursor.execute(sql) + except Exception as e : + pass + finally: + cursor.close() + def write(self,info): + """ + :param info writes a list of data to a given set of fields + """ + if not self.fields : + _fields = info.keys() if type(info) == dict else info[0].keys() + _fields = list (_fields) + self.init(_fields) + + if type(info) != list : + info = [info] + cursor = self.conn.cursor() + try: + + _fields = ",".join(self.fields) + _sql = "INSERT INTO :table (:fields) values (:values)".replace(":table",self.table).replace(":fields",_fields) + _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) + + # for row in info : + # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] + cursor.executemany(_sql,info) + self.conn.commit() + except Exception as e: + print (e) + finally: + cursor.close() + pass + def close(self): + try: + self.conn.close() + finally: + pass + +_args = {"db":"sample","table":"foo","provider":"postgresql"} +# # w = SQLWriter(**_args) +# # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) +r = SQLReader(**_args) +print (r.read(filter='age > 0',limit = 20)) \ No newline at end of file From 99d29f0a1f1dbb0c3d3aa1fd8d8731c8cf8b21fd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 2 Jan 2021 05:24:12 -0600 Subject: [PATCH 044/271] bug fix: sql writer --- transport/sql.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 078f457..69f3fcb 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -44,6 +44,14 @@ class SQLRW : # We need to load the drivers here to see what we are dealing with ... 
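# ('provider' selects both the DB-API module and the default port from the
#  PROVIDERS/DRIVERS maps above: psycopg2 for postgresql/redshift,
#  mysql-connector for mysql/mariadb.)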
_handler = SQLWriter.DRIVERS[_args['provider']] self.conn = _handler.connect(**_info) + + def isready(self): + _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) + try: + return pd.read_sql(_sql,self.conn).columns.tolist() + except Exception as e: + pass + return False def apply(self,_sql): """ This function applies a command and/or a query against the current relational data-store @@ -138,8 +146,8 @@ class SQLWriter(SQLRW,Writer): finally: pass -_args = {"db":"sample","table":"foo","provider":"postgresql"} -# # w = SQLWriter(**_args) -# # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) -r = SQLReader(**_args) -print (r.read(filter='age > 0',limit = 20)) \ No newline at end of file +# _args = {"db":"sample","table":"foo","provider":"postgresql"} +# # # w = SQLWriter(**_args) +# # # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) +# r = SQLReader(**_args) +# print (r.read(filter='age > 0',limit = 20)) \ No newline at end of file From 18c9245aceaccfa34fb2bcc6c299c60263adb917 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 2 Jan 2021 05:29:52 -0600 Subject: [PATCH 045/271] bug fix: sql writer --- transport/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transport/__init__.py b/transport/__init__.py index e5a3418..14df482 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -55,6 +55,7 @@ if sys.version_info[0] > 2 : from transport import rabbitmq as queue from transport import couch as couch from transport import mongo as mongo + from transport import sql as sql else: from common import Reader, Writer #, factory import disk @@ -62,6 +63,7 @@ else: import couch import mongo import s3 + import sql class factory : From 898455f23cd5feddf84a4c69667ff36f9d2a73bf Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 2 Jan 2021 05:35:49 -0600 Subject: [PATCH 046/271] bug fix: sql writer::apply --- transport/sql.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 69f3fcb..9f07730 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -60,12 +60,12 @@ class SQLRW : """ cursor = self.conn.cursor() try: - if "insert" in _sql .lower() or "update" in _sql.lower() : - # Executing a command i.e no expected return values ... - cursor.execute(_sql) - else: + if "select" in _sql.lower() : cursor.close() return pd.read_sql(_sql,self.conn) + else: + # Executing a command i.e no expected return values ... + cursor.execute(_sql) finally: cursor.close() From eef84d8069c1b89e5d43fd3b7d7797ceec3337fc Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 2 Jan 2021 05:38:06 -0600 Subject: [PATCH 047/271] bug fix: sql writer::apply --- transport/sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/sql.py b/transport/sql.py index 9f07730..32fc356 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -66,6 +66,7 @@ class SQLRW : else: # Executing a command i.e no expected return values ... 
cursor.execute(_sql) + self.conn.commit() finally: cursor.close() From fa615aa93157a789396ced37417e60dce7df187f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 2 Jan 2021 17:39:27 -0600 Subject: [PATCH 048/271] bug fix: insert on an etl with data-typing (tricky) --- transport/sql.py | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 32fc356..47ca33a 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -20,7 +20,7 @@ import json # from threading import Lock import pandas as pd - +import numpy as np class SQLRW : PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306"} DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my} @@ -95,6 +95,12 @@ class SQLReader(SQLRW,Reader) : class SQLWriter(SQLRW,Writer): def __init__(self,**_args) : super().__init__(**_args) + # + # In the advent that data typing is difficult to determine we can inspect and perform a default case + # This slows down the process but improves reliability of the data + # NOTE: Proper data type should be set on the target system if their source is unclear. + self._inspect = False if 'inspect' not in _args else _args['inspect'] + self._cast = False if 'cast' not in _args else _args['cast'] def init(self,fields): if not fields : try: @@ -118,6 +124,8 @@ class SQLWriter(SQLRW,Writer): """ :param info writes a list of data to a given set of fields """ + # inspect = False if 'inspect' not in _args else _args['inspect'] + # cast = False if 'cast' not in _args else _args['cast'] if not self.fields : _fields = info.keys() if type(info) == dict else info[0].keys() _fields = list (_fields) @@ -127,14 +135,31 @@ class SQLWriter(SQLRW,Writer): info = [info] cursor = self.conn.cursor() try: - - _fields = ",".join(self.fields) - _sql = "INSERT INTO :table (:fields) values (:values)".replace(":table",self.table).replace(":fields",_fields) - _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) - - # for row in info : - # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] - cursor.executemany(_sql,info) + _sql = "INSERT INTO :table (:fields) values (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) + if self._inspect : + for _row in info : + fields = list(_row.keys()) + if self._cast == False : + values = ",".join(_row.values()) + else: + values = "'"+"','".join([str(value) for value in _row.values()])+"'" + + # values = [ "".join(["'",str(_row[key]),"'"]) if np.nan(_row[key]).isnumeric() else str(_row[key]) for key in _row] + # print (values) + query = _sql.replace(":fields",",".join(fields)).replace(":values",values) + + cursor.execute(query) + + + pass + else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) + + # for row in info : + # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] + cursor.executemany(_sql,info) self.conn.commit() except Exception as e: print (e) From 5bf2012c598f79dee694437ace73555174573d7a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 10 Jan 2021 09:55:04 -0600 Subject: [PATCH 049/271] bug fix: batch update fixes --- setup.py | 2 +- transport/sql.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 2bbc294..e273072 100644 --- 
a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.4", + "version":"1.3.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index 47ca33a..6ffad70 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -59,6 +59,7 @@ class SQLRW : @TODO: Store procedure calls """ cursor = self.conn.cursor() + _out = None try: if "select" in _sql.lower() : cursor.close() @@ -69,6 +70,7 @@ class SQLRW : self.conn.commit() finally: + self.conn.commit() cursor.close() def close(self): try: @@ -142,13 +144,14 @@ class SQLWriter(SQLRW,Writer): if self._cast == False : values = ",".join(_row.values()) else: - values = "'"+"','".join([str(value) for value in _row.values()])+"'" + # values = "'"+"','".join([str(value) for value in _row.values()])+"'" + values = [",".join(["%(",name,")s"]) for name in _row.keys()] # values = [ "".join(["'",str(_row[key]),"'"]) if np.nan(_row[key]).isnumeric() else str(_row[key]) for key in _row] # print (values) query = _sql.replace(":fields",",".join(fields)).replace(":values",values) - cursor.execute(query) + cursor.execute(query,_row.values()) pass @@ -160,10 +163,11 @@ class SQLWriter(SQLRW,Writer): # for row in info : # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] cursor.executemany(_sql,info) - self.conn.commit() + # self.conn.commit() except Exception as e: print (e) finally: + self.conn.commit() cursor.close() pass def close(self): From 40f885b9145fbdf1653e1a4d3811751afd7974d1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 12 Jan 2021 09:34:03 -0600 Subject: [PATCH 050/271] bug fixes: reference --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e273072..6355c6f 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ args = { "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] -args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git" +args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" if sys.version_info[0] == 2 : args['use_2to3'] = True From 687ffec215e79b4809dcd1fb89472dcc40fbccbc Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 15 Mar 2021 12:38:54 -0500 Subject: [PATCH 051/271] bq support --- setup.py | 4 ++-- transport/sql.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 6355c6f..5f09279 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.6", + "version":"1.3.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = 
['pymongo','numpy','cloudant','pika','boto','google-cloud-bigquery','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" if sys.version_info[0] == 2 : diff --git a/transport/sql.py b/transport/sql.py index 6ffad70..5e817ca 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -17,8 +17,8 @@ if sys.version_info[0] > 2 : else: from common import Reader,Writer import json -# from threading import Lock - +from google.oauth2 import service_account +from multiprocessing import Lock import pandas as pd import numpy as np class SQLRW : @@ -175,7 +175,64 @@ class SQLWriter(SQLRW,Writer): self.conn.close() finally: pass +class BigQuery: + def __init__(self,**_args): + path = _args['service_key'] + self.credentials = service_account.Credentials.from_service_account_file(path) + +class BQReader(BigQuery,Reader) : + def __init__(self,**_args): + + super().__init__(**_args) + + pass + def read(self,**_args): + SQL = None + if 'sql' in _args : + SQL = _args['sql'] + elif 'table' in _args: + + table = "".join(["`",_args['table'],"`"]) + SQL = "SELECT * FROM :table ".replace(":table",table) + if SQL and 'limit' in _args: + SQL += " LIMIT "+str(_args['limit']) + return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None +class BQWriter(BigQuery,Writer): + Lock = Lock() + def __init__(self,**_args): + super().__init__(**_args) + + self.parallel = False if 'lock' not in _args else _args['lock'] + self.table = _args['table'] if 'table' in _args else None + self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} + + def write(self,_info,**_args) : + try: + if self.parallel : + self.lock.acquire() + self._write(_info,**_args) + finally: + if self.parallel: + self.lock.release() + def _write(self,_info,**_args) : + _df = None + if type(_info) in [list,pd.DataFrame] : + if type(_info) == list : + _df = pd.DataFrame(_info) + elif type(_info) == pd.DataFrame : + _df = _info + + self.mode['destination_table'] = _args['table'].strip() + _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + + pass +# import transport +# reader = transport.factory.instance(type="sql.BQReader",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) +# _df = reader.read(sql="select * from `2019q1r4_combined.person` limit 10") +# writer = transport.factory.instance(type="sql.BQWriter",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) +# writer.write(_df,table='2019q1r4_combined.foo') +# write.write() # _args = {"db":"sample","table":"foo","provider":"postgresql"} # # # w = SQLWriter(**_args) # # # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) From afcc5ed690e7c73a14d412d64ef4891fcfec5274 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 29 Mar 2021 23:30:54 -0500 Subject: [PATCH 052/271] bug fix ... 
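A rough usage sketch for the BigQuery handlers this patch touches, mirroring the
commented-out example at the bottom of sql.py; the service-account path, dataset
and table names are placeholders, not real resources:

    import transport

    _key = "/path/to/service-account.json"   # hypothetical credentials file
    reader = transport.factory.instance(type="sql.BQReader",
                                        args={"service_key":_key,"dataset":"mydataset"})
    # ':dataset' in the query is swapped for the configured dataset before it runs
    df = reader.read(sql="SELECT * FROM :dataset.person LIMIT 10")

    writer = transport.factory.instance(type="sql.BQWriter",
                                        args={"service_key":_key,"dataset":"mydataset"})
    # an unqualified table name gets the dataset prefixed on write
    writer.write(df,table="person_copy")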
--- setup.py | 4 ++-- transport/sql.py | 45 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 5f09279..f316e24 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8", + "version":"1.3.8.1", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','google-cloud-bigquery','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" if sys.version_info[0] == 2 : diff --git a/transport/sql.py b/transport/sql.py index 5e817ca..dcf9e15 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -18,9 +18,13 @@ else: from common import Reader,Writer import json from google.oauth2 import service_account +from google.cloud import bigquery as bq from multiprocessing import Lock import pandas as pd import numpy as np +import copy + + class SQLRW : PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306"} DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my} @@ -177,9 +181,32 @@ class SQLWriter(SQLRW,Writer): pass class BigQuery: def __init__(self,**_args): - path = _args['service_key'] + path = _args['service_key'] if 'service_key' in _args else _args['private_key'] self.credentials = service_account.Credentials.from_service_account_file(path) - + self.dataset = _args['dataset'] if 'dataset' in _args else None + self.path = path + def meta(self,**_args): + """ + This function returns meta data for a given table or query with dataset/table properly formatted + :param table name of the name WITHOUT including dataset + :param sql sql query to be pulled, + """ + #if 'table' in _args : + # sql = "SELECT * from :dataset."+ _args['table']" limit 1" + #else: + # sql = _args['sql'] + # if 'limit' not in sql.lower() : + # sql = sql + ' limit 1' + + #sql = sql.replace(':dataset',self.dataset) if ':dataset' in args else sql + + # + # Let us return the schema information now for a given table + # + table = _args['table'] + client = bq.Client.from_service_account_json(self.path) + ref = client.dataset(self.dataset).table(table) + return client.get_table(ref).schema class BQReader(BigQuery,Reader) : def __init__(self,**_args): @@ -196,7 +223,8 @@ class BQReader(BigQuery,Reader) : SQL = "SELECT * FROM :table ".replace(":table",table) if SQL and 'limit' in _args: SQL += " LIMIT "+str(_args['limit']) - + if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: + SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None class BQWriter(BigQuery,Writer): Lock = Lock() @@ -223,7 +251,14 @@ class BQWriter(BigQuery,Writer): elif type(_info) == pd.DataFrame : _df = _info - self.mode['destination_table'] = _args['table'].strip() + if '.' 
not in _args['table'] : + self.mode['destination_table'] = '.'.join([self.dataset,_args['table']]) + else: + + self.mode['destination_table'] = _args['table'].strip() + if 'schema' in _args : + self.mode['table_schema'] = _args['schema'] + _mode = copy.deepcopy(self.mode) _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass @@ -237,4 +272,4 @@ class BQWriter(BigQuery,Writer): # # # w = SQLWriter(**_args) # # # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) # r = SQLReader(**_args) -# print (r.read(filter='age > 0',limit = 20)) \ No newline at end of file +# print (r.read(filter='age > 0',limit = 20)) From 84e212d5a3ef0eeef4957d421c04604827ff420b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:27:23 -0500 Subject: [PATCH 053/271] bug fix: lock, parallel processing bigquery --- setup.py | 2 +- transport/sql.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index f316e24..862dbaa 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.1", + "version":"1.3.8.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index dcf9e15..cd4aa2c 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -238,11 +238,11 @@ class BQWriter(BigQuery,Writer): def write(self,_info,**_args) : try: if self.parallel : - self.lock.acquire() + BQWriter.lock.acquire() self._write(_info,**_args) finally: if self.parallel: - self.lock.release() + BQWriter.lock.release() def _write(self,_info,**_args) : _df = None if type(_info) in [list,pd.DataFrame] : From cebc784494c7b3d43592c255476076d05addf680 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 13 Apr 2021 17:29:54 -0500 Subject: [PATCH 054/271] bug fix: lock, parallel processing bigquery --- setup.py | 2 +- transport/sql.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 862dbaa..997b5de 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.2", + "version":"1.3.8.3", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index cd4aa2c..180b962 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -227,7 +227,7 @@ class BQReader(BigQuery,Reader) : SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None class BQWriter(BigQuery,Writer): - Lock = Lock() + lock = Lock() def __init__(self,**_args): super().__init__(**_args) From 91434ec32df6376a125bd3d67eaf8b9f83e78e66 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 14 Jun 2021 14:31:30 -0500 Subject: [PATCH 055/271] bug fix: pandas dtypes --- transport/sql.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 180b962..542b535 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -185,6 +185,7 @@ class BigQuery: self.credentials = service_account.Credentials.from_service_account_file(path) self.dataset = _args['dataset'] if 'dataset' in _args else None self.path = path 
+ self.dtypes = _args['dtypes'] if 'dtypes' in _args else None def meta(self,**_args): """ This function returns meta data for a given table or query with dataset/table properly formatted @@ -225,7 +226,13 @@ class BQReader(BigQuery,Reader) : SQL += " LIMIT "+str(_args['limit']) if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) - return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None + _info = {'credentials':self.credentials,'dialect':'standard'} + if 'dtypes' in _args or self.dtypes : + self.dtypes = _args ['dtypes'] if 'dtypes' in self.dtypes else None + if self.dtypes : + _info['dtypes'] = self.dtypes + return pd.read_gbq(SQL,*_info) if SQL else None + # return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None class BQWriter(BigQuery,Writer): lock = Lock() def __init__(self,**_args): @@ -237,7 +244,7 @@ class BQWriter(BigQuery,Writer): def write(self,_info,**_args) : try: - if self.parallel : + if self.parallel or 'lock' in _args : BQWriter.lock.acquire() self._write(_info,**_args) finally: From b5eb5e099122bcba06c7f2f3ac7ceed07ded2300 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Jun 2021 12:30:48 -0500 Subject: [PATCH 056/271] bug fix: adding types to read function --- setup.py | 2 +- transport/sql.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 997b5de..c9cb4cf 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.3", + "version":"1.3.8.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index 542b535..198c3f3 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -228,10 +228,11 @@ class BQReader(BigQuery,Reader) : SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) _info = {'credentials':self.credentials,'dialect':'standard'} if 'dtypes' in _args or self.dtypes : - self.dtypes = _args ['dtypes'] if 'dtypes' in self.dtypes else None + if not self.dtypes : + self.dtypes = _args ['dtypes'] if 'dtypes' in _args else None if self.dtypes : _info['dtypes'] = self.dtypes - return pd.read_gbq(SQL,*_info) if SQL else None + return pd.read_gbq(SQL,**_info) if SQL else None # return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None class BQWriter(BigQuery,Writer): lock = Lock() From 4c810fe57a5ae65f431b8c4812bcacdacb3d4701 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 20 Jun 2021 18:56:12 -0500 Subject: [PATCH 057/271] bug fix --- transport/sql.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 198c3f3..6e526cc 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -226,12 +226,7 @@ class BQReader(BigQuery,Reader) : SQL += " LIMIT "+str(_args['limit']) if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) - _info = {'credentials':self.credentials,'dialect':'standard'} - if 'dtypes' in _args or self.dtypes : - if not self.dtypes : - self.dtypes = _args ['dtypes'] if 'dtypes' in _args else None - if self.dtypes : - _info['dtypes'] = self.dtypes + _info = {'credentials':self.credentials,'dialect':'standard'} return 
pd.read_gbq(SQL,**_info) if SQL else None # return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None class BQWriter(BigQuery,Writer): From 7c2e9459960c54b62b46f4ca85b5c07d5ee342f5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 29 Jun 2021 15:12:28 -0500 Subject: [PATCH 058/271] bug fix: support for netezza --- setup.py | 4 ++-- transport/sql.py | 55 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/setup.py b/setup.py index c9cb4cf..a599ca0 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.4", + "version":"1.3.8.6.1", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" if sys.version_info[0] == 2 : diff --git a/transport/sql.py b/transport/sql.py index 6e526cc..72e12c6 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -22,23 +22,32 @@ from google.cloud import bigquery as bq from multiprocessing import Lock import pandas as pd import numpy as np +import nzpy as nz #--- netezza drivers import copy class SQLRW : - PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306"} - DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my} + PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306","netezza":5480} + DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} + REFERENCE = { + "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, + "postgresql":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, + "redshift":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, + "mysql":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, + "mariadb":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, + } def __init__(self,**_args): _info = {} - _info['dbname'] = _args['db'] + _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] self.table = _args['table'] self.fields = _args['fields'] if 'fields' in _args else [] - + _provider = _args['provider'] if 'host' in _args : _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] + # _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] + _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] if 'username' in _args or 'user' in _args: key = 'username' if 'username' in _args else 'user' @@ -46,7 +55,13 @@ class SQLRW : _info['password'] = _args['password'] # # We need to load the drivers here to see what we are dealing with ... 
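# (the REFERENCE map introduced above bundles, per provider, the default port,
#  the DB-API handler and a default column dtype used by make() for CREATE TABLE;
#  netezza goes through nzpy, which expects 'database'/'securityLevel' instead of 'dbname'.)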
- _handler = SQLWriter.DRIVERS[_args['provider']] + # _handler = SQLWriter.DRIVERS[_args['provider']] + _handler = SQLWriter.REFERENCE[_provider]['handler'] + self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] + if _handler == nz : + _info['database'] = _info['dbname'] + _info['securityLevel'] = 0 + del _info['dbname'] self.conn = _handler.connect(**_info) def isready(self): @@ -118,11 +133,13 @@ class SQLWriter(SQLRW,Writer): def make(self,fields): self.fields = fields - sql = " ".join(["CREATE TABLE",self.table," (", ",".join(fields),")"]) + + sql = " ".join(["CREATE TABLE",self.table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) cursor = self.conn.cursor() try: cursor.execute(sql) except Exception as e : + print (e) pass finally: cursor.close() @@ -136,12 +153,14 @@ class SQLWriter(SQLRW,Writer): _fields = info.keys() if type(info) == dict else info[0].keys() _fields = list (_fields) self.init(_fields) - + # + # @TODO: Use pandas/odbc ? Not sure b/c it requires sqlalchemy + # if type(info) != list : info = [info] cursor = self.conn.cursor() try: - _sql = "INSERT INTO :table (:fields) values (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) + _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) if self._inspect : for _row in info : fields = list(_row.keys()) @@ -161,15 +180,19 @@ class SQLWriter(SQLRW,Writer): pass else: _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) + # _sql = _sql.replace(":fields",_fields) + # _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) + _sql = _sql.replace("(:fields)","") + values = ", ".join('?'*len(self.fields)) + _sql = _sql.replace(":values",values) # for row in info : # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] cursor.executemany(_sql,info) + # self.conn.commit() except Exception as e: - print (e) + pass finally: self.conn.commit() cursor.close() @@ -265,7 +288,13 @@ class BQWriter(BigQuery,Writer): _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass -# import transport +import transport +try: + _args = {'type':'sql.SQLWriter','args':{'provider':'netezza','host':'ori-netezza.vumc.org','table':'IBM_CCS_DX','username':'nyembsl1','password':'Innovat10n','database':'MALIN_OMOP_RD'}} + df = pd + reader = SQLReader(**_args['args']) +except Exception as error : + print (error) # reader = transport.factory.instance(type="sql.BQReader",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) # _df = reader.read(sql="select * from `2019q1r4_combined.person` limit 10") # writer = transport.factory.instance(type="sql.BQWriter",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) From 19ab5101253354934d47fcb841eab3afb693f1d4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 8 Jul 2021 17:31:29 -0500 Subject: [PATCH 059/271] cli etl tool --- bin/transport | 91 +++++++++++++++++++++++++++++++++++++++++++ setup.py | 4 +- transport/__init__.py | 6 ++- transport/sql.py | 14 +++++-- 4 files changed, 109 insertions(+), 6 deletions(-) create mode 100644 bin/transport diff --git a/bin/transport b/bin/transport new file mode 100644 index 
0000000..9f016e4 --- /dev/null +++ b/bin/transport @@ -0,0 +1,91 @@ +#!/usr/bin/env python +__doc__ = """ +(c) 2018 - 2021 data-transport +steve@the-phi.com, The Phi Technology LLC +https://dev.the-phi.com/git/steve/data-transport.git + +This program performs ETL between 9 supported data sources : Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza,Redshift, Sqlite, File +Usage : + transport --config --procs +@TODO: Create tables if they don't exist for relational databases +""" +import pandas as pd +import numpy as np +import json +import sys +import transport +import time +from multiprocessing import Process +SYS_ARGS = {} +if len(sys.argv) > 1: + + N = len(sys.argv) + for i in range(1,N): + value = None + if sys.argv[i].startswith('--'): + key = sys.argv[i][2:] #.replace('-','') + SYS_ARGS[key] = 1 + if i + 1 < N: + value = sys.argv[i + 1] = sys.argv[i+1].strip() + if key and value and not value.startswith('--'): + SYS_ARGS[key] = value + + + i += 2 + +class Post(Process): + def __init__(self,**args): + super().__init__() + self.PROVIDER = args['target']['type'] + self.writer = transport.factory.instance(**args['target']) + self.rows = args['rows'] + def run(self): + _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + self.writer.write(_info) + self.writer.close() + + +class ETL (Process): + def __init__(self,**_args): + super().__init__() + self.name = _args['id'] + self.reader = transport.factory.instance(**_args['source']) + self._oargs = _args['target'] #transport.factory.instance(**_args['target']) + self.JOB_COUNT = _args['jobs'] + # self.logger = transport.factory.instance(**_args['logger']) + def log(self,**_args) : + _args['name'] = self.name + print (_args) + def run(self): + idf = self.reader.read() + idf = pd.DataFrame(idf) + idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] + self.log(rows=idf.shape[0],cols=idf.shape[1]) + + # + # writing the data to a designated data source + # + try: + self.log(module='write',action='partitioning') + rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) + jobs = [] + for i in rows : + segment = idf.loc[i,:].to_dict(orient='records') + proc = Post(target = self._oargs,rows = segment) + jobs.append(proc) + proc.start() + + self.log(module='write',action='working ...') + while jobs : + jobs = [proc for proc in jobs if proc.is_alive()] + time.sleep(2) + self.log(module='write',action='completed') + except Exception as e: + print (e) +if __name__ == '__main__' : + _config = json.loads(open (SYS_ARGS['config']).read()) + _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] + + for _config in _info : + etl = ETL (**_config) + etl.start() \ No newline at end of file diff --git a/setup.py b/setup.py index a599ca0..a974ef7 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,14 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.6.1", + "version":"1.3.8.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pymongo','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" - 
+args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : args['use_2to3'] = True args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] diff --git a/transport/__init__.py b/transport/__init__.py index 14df482..289678e 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -42,7 +42,8 @@ The configuration for the data-store is as follows : } """ __author__ = 'The Phi Technology' -import numpy as np +import pandas as pd +import numpy as np import json import importlib import sys @@ -97,6 +98,9 @@ class factory : print(['Error ',e]) return anObject +import time + + # class Reader: # def __init__(self): # self.nrows = 0 diff --git a/transport/sql.py b/transport/sql.py index 72e12c6..96f3489 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -87,13 +87,14 @@ class SQLRW : # Executing a command i.e no expected return values ... cursor.execute(_sql) self.conn.commit() - + except Exception as e : + print (e) finally: self.conn.commit() cursor.close() def close(self): try: - self.connect.close() + self.conn.close() except Exception as error : print (error) pass @@ -112,6 +113,12 @@ class SQLReader(SQLRW,Reader) : if 'limit' in _args : _sql = _sql + " LIMIT "+str(_args['limit']) return self.apply(_sql) + def close(self) : + try: + self.conn.close() + except Exception as error : + print (error) + pass class SQLWriter(SQLRW,Writer): def __init__(self,**_args) : @@ -122,7 +129,7 @@ class SQLWriter(SQLRW,Writer): # NOTE: Proper data type should be set on the target system if their source is unclear. self._inspect = False if 'inspect' not in _args else _args['inspect'] self._cast = False if 'cast' not in _args else _args['cast'] - def init(self,fields): + def init(self,fields=None): if not fields : try: self.fields = pd.read_sql("SELECT * FROM :table LIMIT 1".replace(":table",self.table),self.conn).columns.tolist() @@ -192,6 +199,7 @@ class SQLWriter(SQLRW,Writer): # self.conn.commit() except Exception as e: + print(e) pass finally: self.conn.commit() From 25ed547474db456c22341f8a32c3328c88cbd1d8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 8 Jul 2021 17:37:35 -0500 Subject: [PATCH 060/271] bug fix --- bin/transport | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index 9f016e4..e0baf2f 100644 --- a/bin/transport +++ b/bin/transport @@ -84,7 +84,7 @@ class ETL (Process): print (e) if __name__ == '__main__' : _config = json.loads(open (SYS_ARGS['config']).read()) - _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] + _config['jobs'] = 10 #if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] for _config in _info : etl = ETL (**_config) From 6e2ee941acf5ea1a51b4b109aadb9dc3449dda0c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 8 Jul 2021 17:41:43 -0500 Subject: [PATCH 061/271] bug fix --- bin/transport | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/transport b/bin/transport index e0baf2f..b07483e 100644 --- a/bin/transport +++ b/bin/transport @@ -83,9 +83,10 @@ class ETL (Process): except Exception as e: print (e) if __name__ == '__main__' : - _config = json.loads(open (SYS_ARGS['config']).read()) - _config['jobs'] = 10 #if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] + _info = json.loads(open (SYS_ARGS['config']).read()) + for _config in _info : + _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] etl = ETL (**_config) etl.start() \ No newline at end of file From b21ddd69459f87c8562f73eb9938fd6df8ddc1d3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: 
Mon, 19 Jul 2021 18:54:37 -0500 Subject: [PATCH 062/271] bug fix: removing testing code --- transport/sql.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 96f3489..4a8ae23 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -296,13 +296,13 @@ class BQWriter(BigQuery,Writer): _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass -import transport -try: - _args = {'type':'sql.SQLWriter','args':{'provider':'netezza','host':'ori-netezza.vumc.org','table':'IBM_CCS_DX','username':'nyembsl1','password':'Innovat10n','database':'MALIN_OMOP_RD'}} - df = pd - reader = SQLReader(**_args['args']) -except Exception as error : - print (error) +# import transport +# try: +# _args = {'type':'sql.SQLWriter','args':{'provider':'netezza','host':'ori-netezza.vumc.org','table':'IBM_CCS_DX','username':'nyembsl1','password':'Innovat10n','database':'MALIN_OMOP_RD'}} +# df = pd +# reader = SQLReader(**_args['args']) +# except Exception as error : +# print (error) # reader = transport.factory.instance(type="sql.BQReader",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) # _df = reader.read(sql="select * from `2019q1r4_combined.person` limit 10") # writer = transport.factory.instance(type="sql.BQWriter",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) From 835e1253a50916fa8e77580c1290a3994f459a99 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Jul 2021 19:07:06 -0500 Subject: [PATCH 063/271] bug fix: filereader --- setup.py | 4 ++-- transport/disk.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index a974ef7..f26d1ad 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.8.8", + "version":"1.3.9.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','pandas','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/disk.py b/transport/disk.py index 64cabd4..7f73cf7 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -26,10 +26,13 @@ class DiskReader(Reader) : def isready(self): return os.path.exists(self.path) def read(self,**args): + return pd.read_csv(self.path,delimiter=self.delimiter) + def stream(self,**args): """ This function reads the rows from a designated location on disk @param size number of rows to be read, -1 suggests all rows """ + size = -1 if 'size' not in args else int(args['size']) f = open(self.path,'rU') i = 1 @@ -39,7 +42,7 @@ class DiskReader(Reader) : if size == i: break if self.delimiter : - yield row.split(self.char) + yield row.split(self.delimiter) yield row f.close() class 
DiskWriter(Writer): From d3416631bb0488c40d0f44018e67df979abea5f1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Jul 2021 19:40:36 -0500 Subject: [PATCH 064/271] bug fix: cli arguments source --- bin/transport | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index b07483e..197e26e 100644 --- a/bin/transport +++ b/bin/transport @@ -84,7 +84,8 @@ class ETL (Process): print (e) if __name__ == '__main__' : _info = json.loads(open (SYS_ARGS['config']).read()) - + if 'source' in SYS_ARGS : + _info['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} for _config in _info : _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] From 185158f006904484b1642be5bc86efeb887400c3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Jul 2021 19:42:51 -0500 Subject: [PATCH 065/271] bug fix: cli arguments source --- bin/transport | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/transport b/bin/transport index 197e26e..48e6ff7 100644 --- a/bin/transport +++ b/bin/transport @@ -84,10 +84,11 @@ class ETL (Process): print (e) if __name__ == '__main__' : _info = json.loads(open (SYS_ARGS['config']).read()) - if 'source' in SYS_ARGS : - _info['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} for _config in _info : + if 'source' in SYS_ARGS : + _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} + _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] etl = ETL (**_config) etl.start() \ No newline at end of file From 79cdc0c0d03e6cd3728e29369e3e212b90397c1d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 23 Jul 2021 15:22:23 -0500 Subject: [PATCH 066/271] bug fix, enhancement with pandas --- bin/transport | 5 +++-- setup.py | 2 +- transport/mongo.py | 4 ++-- transport/sql.py | 13 +++++++++++-- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/bin/transport b/bin/transport index 48e6ff7..9701c6f 100644 --- a/bin/transport +++ b/bin/transport @@ -41,6 +41,7 @@ class Post(Process): self.rows = args['rows'] def run(self): _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + self.writer.write(_info) self.writer.close() @@ -70,7 +71,7 @@ class ETL (Process): rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) jobs = [] for i in rows : - segment = idf.loc[i,:].to_dict(orient='records') + segment = idf.loc[i,:] #.to_dict(orient='records') proc = Post(target = self._oargs,rows = segment) jobs.append(proc) proc.start() @@ -89,6 +90,6 @@ if __name__ == '__main__' : if 'source' in SYS_ARGS : _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} - _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else SYS_ARGS['jobs'] + _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) etl = ETL (**_config) etl.start() \ No newline at end of file diff --git a/setup.py b/setup.py index f26d1ad..8850ae6 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.9.0", + "version":"1.3.9.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index f206482..4a96c6e 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -142,8 +142,8 @@ class 
MongoWriter(Mongo,Writer): # if type(info) == list : # self.db[self.uid].insert_many(info) # else: - if (type(info) == list) : - self.db[self.uid].insert_many(info) + if type(info) == list or type(info) == pd.DataFrame : + self.db[self.uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) else: self.db[self.uid].insert_one(info) def set(self,document): diff --git a/transport/sql.py b/transport/sql.py index 4a8ae23..143b93d 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -157,14 +157,23 @@ class SQLWriter(SQLRW,Writer): # inspect = False if 'inspect' not in _args else _args['inspect'] # cast = False if 'cast' not in _args else _args['cast'] if not self.fields : - _fields = info.keys() if type(info) == dict else info[0].keys() + if type(info) == list : + _fields = info[0].keys() + elif type(info) == dict : + _fields = info.keys() + elif type(info) == pd.DataFrame : + _fields = info.columns + + # _fields = info.keys() if type(info) == dict else info[0].keys() _fields = list (_fields) self.init(_fields) # # @TODO: Use pandas/odbc ? Not sure b/c it requires sqlalchemy # if type(info) != list : - info = [info] + # + # We are assuming 2 cases i.e dict or pd.DataFrame + info = [info] if type(info) == dict else info.values.tolist() cursor = self.conn.cursor() try: _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) From 10adde7a083d6c04ec860ceb58bdc77cc1bc27b5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Jul 2021 22:32:07 -0500 Subject: [PATCH 067/271] bug fix: sqlwriter --- bin/transport | 51 ++++++++++++++++++++++++++++++++++++------------ transport/sql.py | 5 ++++- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/bin/transport b/bin/transport index 9701c6f..a4d4cc4 100644 --- a/bin/transport +++ b/bin/transport @@ -40,8 +40,7 @@ class Post(Process): self.writer = transport.factory.instance(**args['target']) self.rows = args['rows'] def run(self): - _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - + _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows self.writer.write(_info) self.writer.close() @@ -53,6 +52,7 @@ class ETL (Process): self.reader = transport.factory.instance(**_args['source']) self._oargs = _args['target'] #transport.factory.instance(**_args['target']) self.JOB_COUNT = _args['jobs'] + self.jobs = [] # self.logger = transport.factory.instance(**_args['logger']) def log(self,**_args) : _args['name'] = self.name @@ -61,7 +61,7 @@ class ETL (Process): idf = self.reader.read() idf = pd.DataFrame(idf) idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - self.log(rows=idf.shape[0],cols=idf.shape[1]) + self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) # # writing the data to a designated data source @@ -69,27 +69,52 @@ class ETL (Process): try: self.log(module='write',action='partitioning') rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) - jobs = [] + for i in rows : + _id = 'segment #'.join([str(rows.index(i)),self.name]) segment = idf.loc[i,:] #.to_dict(orient='records') - proc = Post(target = self._oargs,rows = segment) - jobs.append(proc) + proc = Post(target = self._oargs,rows = segment,name=_id) + self.jobs.append(proc) proc.start() - self.log(module='write',action='working ...') - while jobs : - jobs = [proc for proc in jobs if proc.is_alive()] - time.sleep(2) - self.log(module='write',action='completed') + 
self.log(module='write',action='working ...',name=self.name) + except Exception as e: print (e) + + def is_done(self): + self.jobs = [proc for proc in self.jobs if proc.is_alive()] + return len(self.jobs) == 0 +def apply(_args) : + """ + This function will apply a set of commands against a data-store. The expected structure is as follows : + {"store":...,"apply":[]} + """ + handler = transport.factory.instance(**_args['store']) + for cmd in _args['apply'] : + handler.apply(cmd) + handler.close() if __name__ == '__main__' : _info = json.loads(open (SYS_ARGS['config']).read()) - + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None + procs = [] for _config in _info : if 'source' in SYS_ARGS : _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) etl = ETL (**_config) - etl.start() \ No newline at end of file + etl.start() + procs.append(etl) + if index and _info.index(_config) == index : + break + # + # + N = len(procs) + while procs : + procs = [thread for thread in procs if not thread.is_done()] + if len(procs) < N : + print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) + N = len(procs) + time.sleep(1) + print ("We're done !!") \ No newline at end of file diff --git a/transport/sql.py b/transport/sql.py index 143b93d..fb1d8e7 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -58,6 +58,7 @@ class SQLRW : # _handler = SQLWriter.DRIVERS[_args['provider']] _handler = SQLWriter.REFERENCE[_provider]['handler'] self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] + self._provider = _provider if _handler == nz : _info['database'] = _info['dbname'] _info['securityLevel'] = 0 @@ -199,11 +200,13 @@ class SQLWriter(SQLRW,Writer): # _sql = _sql.replace(":fields",_fields) # _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) _sql = _sql.replace("(:fields)","") - values = ", ".join('?'*len(self.fields)) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) _sql = _sql.replace(":values",values) + print (_sql) # for row in info : # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] + cursor.executemany(_sql,info) # self.conn.commit() From 4e498ebcf0db0a0e3ebe78d12dd0ba44185aaac0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Jul 2021 22:45:52 -0500 Subject: [PATCH 068/271] bug fix: sqlwriter --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8850ae6..99d0930 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.9.2", + "version":"1.3.9.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 83cc1778c72dbc6915f61f46433c46194fed6fd4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 28 Jul 2021 22:51:19 -0500 Subject: [PATCH 069/271] bug fix: print --- transport/sql.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index fb1d8e7..5d08032 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -202,7 +202,6 @@ class SQLWriter(SQLRW,Writer): _sql = _sql.replace("(:fields)","") values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name 
in self.fields]) _sql = _sql.replace(":values",values) - print (_sql) # for row in info : # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] From 0c0eaf7063631653dde61611208cac5c64acf5e6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Sep 2021 02:33:40 -0500 Subject: [PATCH 070/271] bug fix: imports, testing code removed --- transport/mongo.py | 2 ++ transport/sql.py | 17 ----------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/transport/mongo.py b/transport/mongo.py index 4a96c6e..84200f6 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -9,6 +9,8 @@ from bson.objectid import ObjectId from bson.binary import Binary import json from datetime import datetime +import pandas as pd + import gridfs # from transport import Reader,Writer import sys diff --git a/transport/sql.py b/transport/sql.py index 5d08032..7492113 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -307,20 +307,3 @@ class BQWriter(BigQuery,Writer): _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass -# import transport -# try: -# _args = {'type':'sql.SQLWriter','args':{'provider':'netezza','host':'ori-netezza.vumc.org','table':'IBM_CCS_DX','username':'nyembsl1','password':'Innovat10n','database':'MALIN_OMOP_RD'}} -# df = pd -# reader = SQLReader(**_args['args']) -# except Exception as error : -# print (error) -# reader = transport.factory.instance(type="sql.BQReader",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) -# _df = reader.read(sql="select * from `2019q1r4_combined.person` limit 10") -# writer = transport.factory.instance(type="sql.BQWriter",args={"service_key":"/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json"}) -# writer.write(_df,table='2019q1r4_combined.foo') -# write.write() -# _args = {"db":"sample","table":"foo","provider":"postgresql"} -# # # w = SQLWriter(**_args) -# # # w.write({"name":"kalara.io","email":"ceo@kalara.io","age":10}) -# r = SQLReader(**_args) -# print (r.read(filter='age > 0',limit = 20)) From 135d4ef3f644e4fa1415c39f556d8167f9b37be4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 1 Sep 2021 02:34:41 -0500 Subject: [PATCH 071/271] version: update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 99d0930..aaf7364 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.9.4", + "version":"1.3.9.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 44ef71eb92b3c6c5e1254fc62a055194827776d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 3 Sep 2021 01:25:45 -0500 Subject: [PATCH 072/271] bug fix --- transport/sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index 7492113..844ba52 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -303,7 +303,8 @@ class BQWriter(BigQuery,Writer): self.mode['destination_table'] = _args['table'].strip() if 'schema' in _args : self.mode['table_schema'] = _args['schema'] - _mode = copy.deepcopy(self.mode) + # _mode = copy.deepcopy(self.mode) + _mode = self.mode _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass From 2de6e51bdbd4c03ae9705a91b37f402f6c95e21a Mon Sep 17 
00:00:00 2001 From: Steve Nyemba Date: Thu, 18 Nov 2021 15:21:26 -0600 Subject: [PATCH 073/271] bug fix: simplyfying factory interface --- bin/transport | 10 +++ setup.py | 2 +- transport/__init__.py | 180 +++++++++++++----------------------------- transport/disk.py | 8 +- transport/mongo.py | 2 +- transport/sql.py | 59 ++++++++------ 6 files changed, 106 insertions(+), 155 deletions(-) diff --git a/bin/transport b/bin/transport index a4d4cc4..b2c2503 100644 --- a/bin/transport +++ b/bin/transport @@ -5,9 +5,19 @@ steve@the-phi.com, The Phi Technology LLC https://dev.the-phi.com/git/steve/data-transport.git This program performs ETL between 9 supported data sources : Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza,Redshift, Sqlite, File +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + Usage : transport --config --procs @TODO: Create tables if they don't exist for relational databases + + """ import pandas as pd import numpy as np diff --git a/setup.py b/setup.py index aaf7364..fdb98a6 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.3.9.6", + "version":"1.4.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 289678e..55ab7b0 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -65,9 +65,36 @@ else: import mongo import s3 import sql +import psycopg2 as pg +import mysql.connector as my +from google.cloud import bigquery as bq +import nzpy as nz #--- netezza drivers +import os - +RDBMS = { + + "postgresql":{"port":"5432","driver":pg}, + "redshift":{"port":"5432","driver":pg}, + "netezza":{"port":"5480","driver":nz}, + "mysql":{"port":"3306","driver":my}, + "mariadb":{"port":"3306","driver":my}, + "mongodb":{"port":"27017","class":{"read"}}, + "couchdb":{"port":"5984"} +} class factory : + TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} + PROVIDERS = { + "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, + "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, + "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, + "redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, + "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, + "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, + "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, + "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, + "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, + "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}} + @staticmethod def instance(**args): """ @@ -99,131 +126,30 @@ class factory : return anObject import time - - -# class Reader: -# def __init__(self): -# self.nrows = 0 -# self.xchar = None - -# def row_count(self): -# content = self.read() -# return np.sum([1 for row in content]) -# def delimiter(self,sample): -# """ -# This function determines the most common delimiter from a subset of possible delimiters. 
-# It uses a statistical approach (distribution) to guage the distribution of columns for a given delimiter - -# :sample sample string/content expecting matrix i.e list of rows -# """ - -# m = {',':[],'\t':[],'|':[],'\x3A':[]} -# delim = m.keys() -# for row in sample: -# for xchar in delim: -# if row.split(xchar) > 1: -# m[xchar].append(len(row.split(xchar))) -# else: -# m[xchar].append(0) - - - -# # -# # The delimiter with the smallest variance, provided the mean is greater than 1 -# # This would be troublesome if there many broken records sampled -# # -# m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1} -# index = m.values().index( min(m.values())) -# xchar = m.keys()[index] - -# return xchar -# def col_count(self,sample): -# """ -# This function retirms the number of columns of a given sample -# @pre self.xchar is not None -# """ - -# m = {} -# i = 0 - -# for row in sample: -# row = self.format(row) -# id = str(len(row)) -# #id = str(len(row.split(self.xchar))) - -# if id not in m: -# m[id] = 0 -# m[id] = m[id] + 1 - -# index = m.values().index( max(m.values()) ) -# ncols = int(m.keys()[index]) +def instance(provider,context,**_args): + """ + + @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...} + @param context read|write|rw + @param _args argument to got with the datastore (username,password,host,port ...) + """ + _id = context if context in ['read','write'] else None + if _id : + args = {'provider':_id} + for key in factory.PROVIDERS[provider] : + if key == 'class' : + continue + value = factory.PROVIDERS[provider][key] + args[key] = value + # + # + args = dict(args,**_args) - -# return ncols; -# def format (self,row): -# """ -# This function will clean records of a given row by removing non-ascii characters -# @pre self.xchar is not None -# """ - -# if isinstance(row,list) == False: -# # -# # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary) -# cols = self.split(row) -# #cols = row.split(self.xchar) -# else: -# cols = row ; -# return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols] - -# def split (self,row): -# """ -# This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes. -# @pre : self.xchar is not None -# """ - -# pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"]) -# return re.findall(pattern,row.replace('\n','')) - - -# class Writer: - -# def format(self,row,xchar): -# if xchar is not None and isinstance(row,list): -# return xchar.join(row)+'\n' -# elif xchar is None and isinstance(row,dict): -# row = json.dumps(row) -# return row -# """ -# It is important to be able to archive data so as to insure that growth is controlled -# Nothing in nature grows indefinitely neither should data being handled. -# """ -# def archive(self): -# pass -# def flush(self): -# pass - -# class factory : -# @staticmethod -# def instance(**args): - -# source = args['type'] -# params = args['args'] -# anObject = None - -# if source in ['HttpRequestReader','HttpSessionWriter']: -# # -# # @TODO: Make sure objects are serializable, be smart about them !! 
-# # -# aClassName = ''.join([source,'(**params)']) - + # print (provider in factory.PROVIDERS) + if 'class' in factory.PROVIDERS[provider]: + pointer = factory.PROVIDERS[provider]['class'][_id] + else: + pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter + return pointer(**args) -# else: - -# stream = json.dumps(params) -# aClassName = ''.join([source,'(**',stream,')']) -# try: -# anObject = eval( aClassName) -# #setattr(anObject,'name',source) -# except Exception,e: -# print ['Error ',e] -# return anObject + return None \ No newline at end of file diff --git a/transport/disk.py b/transport/disk.py index 7f73cf7..0f1f6fb 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -22,10 +22,12 @@ class DiskReader(Reader) : Reader.__init__(self) self.path = params['path'] ; - self.delimiter = params['delimiter'] if 'delimiter' in params else None + self.delimiter = params['delimiter'] if 'delimiter' in params else ',' def isready(self): return os.path.exists(self.path) def read(self,**args): + _path = self.path if 'path' not in args else args['path'] + _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter'] return pd.read_csv(self.path,delimiter=self.delimiter) def stream(self,**args): """ @@ -121,6 +123,10 @@ class SQLiteReader (DiskReader): elif 'filter' in args : sql = "SELECT :fields FROM ",self.table, "WHERE (:filter)".replace(":filter",args['filter']) sql = sql.replace(":fields",args['fields']) if 'fields' in args else sql.replace(":fields","*") + else: + sql = ' '.join(['SELECT * FROM ',self.table]) + if 'limit' in args : + sql = sql + " LIMIT "+args['limit'] return pd.read_sql(sql,self.conn) def close(self): try: diff --git a/transport/mongo.py b/transport/mongo.py index 84200f6..d1ee9ef 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -84,7 +84,7 @@ class MongoReader(Mongo,Reader): out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) - return r + return pd.DataFrame(r) else: collection = self.db[self.uid] _filter = args['filter'] if 'filter' in args else {} diff --git a/transport/sql.py b/transport/sql.py index 844ba52..6d693d8 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -24,10 +24,11 @@ import pandas as pd import numpy as np import nzpy as nz #--- netezza drivers import copy +import os class SQLRW : - PROVIDERS = {"postgresql":"5432","redshift":"5432","mysql":"3306","mariadb":"3306","netezza":5480} + DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} REFERENCE = { "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, @@ -41,13 +42,19 @@ class SQLRW : _info = {} _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] - self.table = _args['table'] + self.table = _args['table'] if 'table' in _args else None self.fields = _args['fields'] if 'fields' in _args else [] - _provider = _args['provider'] - if 'host' in _args : - _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - # _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] - _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] + # _provider = _args['provider'] + # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] + # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] + + _info['host'] = _args['host'] + _info['port'] = _args['port'] + + # if 'host' in _args : + # _info['host'] = 'localhost' if 'host' not in 
_args else _args['host'] + # # _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] + # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] if 'username' in _args or 'user' in _args: key = 'username' if 'username' in _args else 'user' @@ -55,10 +62,14 @@ class SQLRW : _info['password'] = _args['password'] # # We need to load the drivers here to see what we are dealing with ... - # _handler = SQLWriter.DRIVERS[_args['provider']] - _handler = SQLWriter.REFERENCE[_provider]['handler'] - self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] - self._provider = _provider + + + # _handler = SQLWriter.REFERENCE[_provider]['handler'] + _handler = _args['driver'] #-- handler to the driver + self._dtype = _args['default']['type'] if 'default' in _args and 'type' in _args['default'] else 'VARCHAR(256)' + self._provider = _args['provider'] + # self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] + # self._provider = _provider if _handler == nz : _info['database'] = _info['dbname'] _info['securityLevel'] = 0 @@ -228,24 +239,13 @@ class BigQuery: self.dataset = _args['dataset'] if 'dataset' in _args else None self.path = path self.dtypes = _args['dtypes'] if 'dtypes' in _args else None + self.table = _args['table'] if 'table' in _args else None def meta(self,**_args): """ This function returns meta data for a given table or query with dataset/table properly formatted :param table name of the name WITHOUT including dataset :param sql sql query to be pulled, """ - #if 'table' in _args : - # sql = "SELECT * from :dataset."+ _args['table']" limit 1" - #else: - # sql = _args['sql'] - # if 'limit' not in sql.lower() : - # sql = sql + ' limit 1' - - #sql = sql.replace(':dataset',self.dataset) if ':dataset' in args else sql - - # - # Let us return the schema information now for a given table - # table = _args['table'] client = bq.Client.from_service_account_json(self.path) ref = client.dataset(self.dataset).table(table) @@ -258,12 +258,15 @@ class BQReader(BigQuery,Reader) : pass def read(self,**_args): SQL = None + table = self.table if 'table' not in _args else _args['table'] if 'sql' in _args : SQL = _args['sql'] - elif 'table' in _args: + elif table: - table = "".join(["`",_args['table'],"`"]) + table = "".join(["`",table,"`"]) if '.' 
in table else "".join(["`:dataset.",table,"`"]) SQL = "SELECT * FROM :table ".replace(":table",table) + if not SQL : + return None if SQL and 'limit' in _args: SQL += " LIMIT "+str(_args['limit']) if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: @@ -271,6 +274,7 @@ class BQReader(BigQuery,Reader) : _info = {'credentials':self.credentials,'dialect':'standard'} return pd.read_gbq(SQL,**_info) if SQL else None # return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None + class BQWriter(BigQuery,Writer): lock = Lock() def __init__(self,**_args): @@ -308,3 +312,8 @@ class BQWriter(BigQuery,Writer): _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) pass +# +# Aliasing the big query classes allowing it to be backward compatible +# +BigQueryReader = BQReader +BigQueryWriter = BQWriter \ No newline at end of file From ccc05acc01f436770a6e2cd0260d9b96ee5d6dbc Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 9 Dec 2021 15:25:58 -0600 Subject: [PATCH 074/271] bug fix: etl engine, sqlite inserts --- bin/transport | 52 +++++++++++++++++++++++++++++++++++++------ setup.py | 2 +- transport/__init__.py | 27 +++++++++++----------- transport/disk.py | 22 ++++++++++++------ 4 files changed, 74 insertions(+), 29 deletions(-) mode change 100644 => 100755 bin/transport diff --git a/bin/transport b/bin/transport old mode 100644 new mode 100755 index b2c2503..01c5f71 --- a/bin/transport +++ b/bin/transport @@ -16,7 +16,17 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI Usage : transport --config --procs @TODO: Create tables if they don't exist for relational databases +example of configuration : +1. Move data from a folder to a data-store + transport [--folder ] --config #-- assuming the configuration doesn't have folder + transport --folder --provider -- --table|doc +In this case the configuration should look like : + {folder:..., target:{}} +2. Move data from one source to another + transport --config + {source:{..},target:{..}} or [{source:{..},target:{..}},{source:{..},target:{..}}] + """ import pandas as pd @@ -46,11 +56,23 @@ if len(sys.argv) > 1: class Post(Process): def __init__(self,**args): super().__init__() - self.PROVIDER = args['target']['type'] - self.writer = transport.factory.instance(**args['target']) + + if 'provider' not in args['target'] : + self.PROVIDER = args['target']['type'] + self.writer = transport.factory.instance(**args['target']) + else: + self.PROVIDER = args['target']['provider'] + args['target']['context'] = 'write' + + self.writer = transport.instance(**args['target']) + # + # If the table doesn't exists maybe create it ? + # self.rows = args['rows'] + def run(self): - _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + self.writer.write(_info) self.writer.close() @@ -59,7 +81,19 @@ class ETL (Process): def __init__(self,**_args): super().__init__() self.name = _args['id'] - self.reader = transport.factory.instance(**_args['source']) + if 'provider' not in _args['source'] : + #@deprecate + self.reader = transport.factory.instance(**_args['source']) + else: + # + # This is the new interface + _args['source']['context'] = 'read' + + self.reader = transport.instance(**_args['source']) + # + # do we have an sql query provided or not .... 
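A fuller sketch of the configuration file described in the usage notes above; the provider names, file paths and connection values below are placeholders, and only the id, source and target keys are read here (jobs is supplied on the command line):

    [
        {
            "id":"csv-to-postgres",
            "source":{"provider":"file","path":"/data/sample.csv","delimiter":","},
            "target":{"provider":"postgresql","host":"localhost","database":"sample","table":"foo"}
        }
    ]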
+ # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None + self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None self._oargs = _args['target'] #transport.factory.instance(**_args['target']) self.JOB_COUNT = _args['jobs'] self.jobs = [] @@ -68,8 +102,11 @@ class ETL (Process): _args['name'] = self.name print (_args) def run(self): - idf = self.reader.read() - idf = pd.DataFrame(idf) + if self.cmd : + idf = self.reader.read(**self.cmd) + else: + idf = self.reader.read() + idf = pd.DataFrame(idf) idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) @@ -79,7 +116,8 @@ class ETL (Process): try: self.log(module='write',action='partitioning') rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) - + # + # @TODO: locks for i in rows : _id = 'segment #'.join([str(rows.index(i)),self.name]) segment = idf.loc[i,:] #.to_dict(orient='records') diff --git a/setup.py b/setup.py index fdb98a6..9d4ff7e 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.0", + "version":"1.4.1", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 55ab7b0..5283f11 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -71,16 +71,7 @@ from google.cloud import bigquery as bq import nzpy as nz #--- netezza drivers import os -RDBMS = { - - "postgresql":{"port":"5432","driver":pg}, - "redshift":{"port":"5432","driver":pg}, - "netezza":{"port":"5480","driver":nz}, - "mysql":{"port":"3306","driver":my}, - "mariadb":{"port":"3306","driver":my}, - "mongodb":{"port":"27017","class":{"read"}}, - "couchdb":{"port":"5984"} -} + class factory : TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { @@ -91,9 +82,14 @@ class factory : "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, - "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, - "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, + "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, + "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}} + # + # creating synonyms + PROVIDERS['mongodb'] = PROVIDERS['mongo'] + PROVIDERS['couchdb'] = PROVIDERS['couch'] + PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] @staticmethod def instance(**args): @@ -126,14 +122,17 @@ class factory : return anObject import time -def instance(provider,context,**_args): +def instance(**_args): """ @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...} @param context read|write|rw @param _args argument to got with the datastore (username,password,host,port ...) 
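    e.g: a minimal sketch, assuming a local PostgreSQL database named sample with a table foo (the variable name pgr is illustrative)

        import transport
        pgr = transport.instance(provider='postgresql',context='read',database='sample',table='foo')
        df  = pgr.read(sql="SELECT * FROM foo LIMIT 10") #-- returns a pandas data-frame
        pgr.close()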
""" - _id = context if context in ['read','write'] else None + + provider = _args['provider'] + context = _args['context'] + _id = context if context in ['read','write'] else 'read' if _id : args = {'provider':_id} for key in factory.PROVIDERS[provider] : diff --git a/transport/disk.py b/transport/disk.py index 0f1f6fb..89ab75b 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -114,6 +114,7 @@ class DiskWriter(Writer): class SQLiteReader (DiskReader): def __init__(self,**args): DiskReader.__init__(self,**args) + self.path = args['database'] if 'database' in args else args['path'] self.conn = sqlite3.connect(self.path,isolation_level=None) self.conn.row_factory = sqlite3.Row self.table = args['table'] @@ -145,7 +146,7 @@ class SQLiteWriter(DiskWriter) : DiskWriter.__init__(self,**args) self.table = args['table'] - self.conn = sqlite3.connect(self.path,isolation_level=None) + self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") self.conn.row_factory = sqlite3.Row self.fields = args['fields'] if 'fields' in args else [] @@ -184,20 +185,27 @@ class SQLiteWriter(DiskWriter) : if not self.fields : self.init(list(info.keys())) - if type(info) != list : + if type(info) == object : info = [info] + elif type(info) == pd.DataFrame : + info = info.to_dict(orient='records') SQLiteWriter.LOCK.acquire() try: + cursor = self.conn.cursor() - sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(':values')"]) + sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) for row in info : - stream = json.dumps(row) - stream = stream.replace("'","''") - cursor.execute(sql.replace(":values",stream) ) + stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] + stream = json.dumps(stream).replace("[","").replace("]","") + + + self.conn.execute(sql.replace(":values",stream) ) + # cursor.commit() - # self.conn.commit() + self.conn.commit() # print (sql) except Exception as e : + print (e) pass SQLiteWriter.LOCK.release() \ No newline at end of file From b239a5149ff1f7497f069a2bfc84b4a7ce0ad4ac Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 29 Jan 2022 11:15:45 -0600 Subject: [PATCH 075/271] bug fixes and simplifying interfaces --- setup.py | 2 +- transport/__init__.py | 53 +++++++++++++++--------------------- transport/disk.py | 63 ++++++++++++++++++++++++++++++++++--------- transport/sql.py | 48 ++++++++++++++++++++++++++------- 4 files changed, 111 insertions(+), 55 deletions(-) diff --git a/setup.py b/setup.py index 9d4ff7e..ed82db4 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.1", + "version":"1.4.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 5283f11..94b01eb 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -1,45 +1,24 @@ """ -Data Transport - 1.0 -Steve L. Nyemba, The Phi Technology LLC +Data Transport, The Phi Technology LLC +Steve L. 
Nyemba, steve@the-phi.com -This module is designed to serve as a wrapper to a set of supported data stores : +This library is designed to serve as a wrapper to a set of supported data stores : - couchdb - mongodb - Files (character delimited) - Queues (RabbmitMq) - Session (Flask) - s3 + - sqlite The supported operations are read/write and providing meta data to the calling code Requirements : pymongo boto couldant The configuration for the data-store is as follows : - couchdb: - { - args:{ - url:, - username:, - password:, - dbname:, - doc: - } - } - RabbitMQ: - { - - } - Mongodb: - { - args:{ - host:, #localhost:27017 - username:, - password:, - dbname:, - doc:s - - } - } + e.g: + mongodb + provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ __author__ = 'The Phi Technology' import pandas as pd @@ -90,9 +69,17 @@ class factory : PROVIDERS['mongodb'] = PROVIDERS['mongo'] PROVIDERS['couchdb'] = PROVIDERS['couch'] PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] - + @staticmethod - def instance(**args): + def instance(**_args): + if 'type' in _args : + # + # Legacy code being returned + return factory._instance(**_args); + else: + return instance(**_args) + @staticmethod + def _instance(**args): """ This class will create an instance of a transport when providing :type name of the type we are trying to create @@ -131,7 +118,7 @@ def instance(**_args): """ provider = _args['provider'] - context = _args['context'] + context = _args['context']if 'context' in _args else None _id = context if context in ['read','write'] else 'read' if _id : args = {'provider':_id} @@ -142,6 +129,7 @@ def instance(**_args): args[key] = value # # + args = dict(args,**_args) # print (provider in factory.PROVIDERS) @@ -149,6 +137,7 @@ def instance(**_args): pointer = factory.PROVIDERS[provider]['class'][_id] else: pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter + return pointer(**args) - return None \ No newline at end of file + return None diff --git a/transport/disk.py b/transport/disk.py index 89ab75b..14bb8a0 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -111,13 +111,47 @@ class DiskWriter(Writer): pass finally: DiskWriter.THREAD_LOCK.release() -class SQLiteReader (DiskReader): - def __init__(self,**args): - DiskReader.__init__(self,**args) - self.path = args['database'] if 'database' in args else args['path'] - self.conn = sqlite3.connect(self.path,isolation_level=None) +class SQLite : + def __init__(self,**_args) : + self.path = _args['database'] if 'database' in _args else _args['path'] + self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") self.conn.row_factory = sqlite3.Row - self.table = args['table'] + self.fields = _args['fields'] if 'fields' in _args else [] + def has (self,**_args): + found = False + try: + if 'table' in _args : + table = _args['table'] + sql = "SELECT * FROM :table limit 1".replace(":table",table) + _df = pd.read_sql(sql,self.conn) + found = _df.columns.size > 0 + except Exception as e: + pass + return found + def close(self): + try: + self.conn.close() + except Exception as e : + print(e) + def apply(self,sql): + try: + if not sql.lower().startswith('select'): + cursor = self.conn.cursor() + cursor.execute(sql) + cursor.close() + self.conn.commit() + else: + return pd.read_sql(sql,self.conn) + except Exception as e: + print (e) +class SQLiteReader (SQLite,DiskReader): + def __init__(self,**args): + super().__init__(**args) + # DiskReader.__init__(self,**args) + # self.path = args['database'] if 'database' in args else 
args['path'] + # self.conn = sqlite3.connect(self.path,isolation_level=None) + # self.conn.row_factory = sqlite3.Row + self.table = args['table'] if 'table' in args else None def read(self,**args): if 'sql' in args : sql = args['sql'] @@ -135,7 +169,7 @@ class SQLiteReader (DiskReader): except Exception as e : pass -class SQLiteWriter(DiskWriter) : +class SQLiteWriter(SQLite,DiskWriter) : connection = None LOCK = Lock() def __init__(self,**args): @@ -143,12 +177,13 @@ class SQLiteWriter(DiskWriter) : :path :fields json|csv """ - DiskWriter.__init__(self,**args) - self.table = args['table'] + # DiskWriter.__init__(self,**args) + super().__init__(**args) + self.table = args['table'] if 'table' in args else None - self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") - self.conn.row_factory = sqlite3.Row - self.fields = args['fields'] if 'fields' in args else [] + # self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") + # self.conn.row_factory = sqlite3.Row + # self.fields = args['fields'] if 'fields' in args else [] if self.fields and not self.isready(): self.init(self.fields) @@ -185,7 +220,7 @@ class SQLiteWriter(DiskWriter) : if not self.fields : self.init(list(info.keys())) - if type(info) == object : + if type(info) == dict : info = [info] elif type(info) == pd.DataFrame : info = info.to_dict(orient='records') @@ -196,6 +231,8 @@ class SQLiteWriter(DiskWriter) : cursor = self.conn.cursor() sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) for row in info : + print (row) + print (row.values()) stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] stream = json.dumps(stream).replace("[","").replace("]","") diff --git a/transport/sql.py b/transport/sql.py index 6d693d8..c5c8f89 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -75,7 +75,15 @@ class SQLRW : _info['securityLevel'] = 0 del _info['dbname'] self.conn = _handler.connect(**_info) - + def has(self,**_args): + found = False + try: + table = _args['table'] + sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) + found = pd.read_sql(sql,self.conn).shape[0] + except Exception as e: + pass + return found def isready(self): _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) try: @@ -201,8 +209,14 @@ class SQLWriter(SQLRW,Writer): # values = [ "".join(["'",str(_row[key]),"'"]) if np.nan(_row[key]).isnumeric() else str(_row[key]) for key in _row] # print (values) query = _sql.replace(":fields",",".join(fields)).replace(":values",values) - - cursor.execute(query,_row.values()) + if type(info) == pd.DataFrame : + _values = info.values.tolist() + elif type(info) == list and type(info[0]) == dict: + print ('........') + _values = [tuple(item.values()) for item in info] + else: + _values = info; + cursor.execute(query,_values) pass @@ -210,14 +224,23 @@ class SQLWriter(SQLRW,Writer): _fields = ",".join(self.fields) # _sql = _sql.replace(":fields",_fields) # _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) - _sql = _sql.replace("(:fields)","") + # _sql = _sql.replace("(:fields)","") + _sql = _sql.replace(":fields",_fields) values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) _sql = _sql.replace(":values",values) - - # for row in info : - # values = ["'".join(["",value,""]) if not str(value).isnumeric() else value for value in row.values()] - - cursor.executemany(_sql,info) + if type(info) == pd.DataFrame : + 
_info = info[self.fields].values.tolist() + elif type(info) == dict : + _info = info.values() + else: + # _info = [] + + _info = pd.DataFrame(info)[self.fields].values.tolist() + # for row in info : + + # if type(row) == dict : + # _info.append( list(row.values())) + cursor.executemany(_sql,_info) # self.conn.commit() except Exception as e: @@ -250,6 +273,13 @@ class BigQuery: client = bq.Client.from_service_account_json(self.path) ref = client.dataset(self.dataset).table(table) return client.get_table(ref).schema + def has(self,**_args): + found = False + try: + found = self.meta(**_args) is not None + except Exception as e: + pass + return found class BQReader(BigQuery,Reader) : def __init__(self,**_args): From 69e0b4d946f7d73ec4a618a5f7f9a50492c729b0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 29 Jan 2022 17:01:43 -0600 Subject: [PATCH 076/271] documentation --- README.md | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 4e862af..4a4657c 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,31 @@ # Introduction -This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple interface against specific various data-sources. The supported data sources implement functionalities against : +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL** and **SQL** data stores and leverages **pandas** - - Rabbitmq-server - - Couchdb-server - - Mongodb-server - - Http Session : {csv,tab,pipe,sql} - - Disk{Reader|Writer} : csv, tab, pipe, sql on disk +The supported data store providers : + +| Provider | Underlying Drivers | Description | +| ---- | ---| ---- | +| sqlite| Native SQLite|SQLite3| +| postgresql| psycopg2 | PostgreSQL +| redshift| psycopg2 | Amazon Redshift +| netezza| nzpsql | IBM Neteeza +| Files: CSV, TSV| pandas| pandas data-frame +| Couchdb| cloudant | Couchbase/Couchdb +| mongodb| pymongo | Mongodb +| mysql| mysql| Mysql +| bigquery| google-bigquery| Google BigQuery +| mariadb| mysql| Mariadb +| rabbitmq|pika| RabbitMQ Publish/Subscribe + +# Why Use Data-Transport ? + +Mostly data scientists that don't really care about the underlying database and would like to manipulate data transparently. + +1. Familiarity with **pandas data-frames** +2. Connectivity **drivers** are included +3. Useful for ETL -Such an interface is used to facilitate data transport in and out of a store for whatever an application may need (log, session management, ...) 
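As a quick sketch of the ETL case above, moving a table from one store to another; the sqlite file, database and table names are placeholders:

    import transport
    src = transport.factory.instance(provider='sqlite',context='read',database='/tmp/sample.db3',table='foo')
    dst = transport.factory.instance(provider='postgresql',context='write',database='sample',table='foo')
    dst.write(src.read()) #-- src.read() returns a pandas data-frame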
### Installation @@ -21,23 +38,26 @@ Binaries and eggs will be provided later on ### Usage -The basic usage revolves around a factory class (to be a singleton) +In your code, perform the import transport from transport import factory # # importing a mongo reader args = {"host":":","dbname":"","doc":"",["username":"","password":""]} - mreader = factory.instance(type='mongo.MonoReader',args=args) + reader = factory.instance(provider='mongodb',doc=,db=) # - # reading a document and executing a view + # reading a document i.e just applying a find (no filters) # - document = mreader.read() - result = mreader.view(name) + df = mreader.read() #-- pandas data frame + df.head() + # - # importing a couchdb reader - args = {"url":":","dbname":"","doc":"","username":"","password":""} - creader = factory.instance(type='couch.CouchReader',args=args) + # reading from postgresql + + pgreader = factory.instance(type='postgresql',database=,table=) + pg.read() #-- will read the table by executing a SELECT + pg.read(sql=) # # Reading a document and executing a view From d0651ef6e6a9a214989c01c3b927f4ff73a06495 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 29 Jan 2022 17:18:20 -0600 Subject: [PATCH 077/271] documentation --- README.md | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 4a4657c..805fb8f 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,11 @@ This project implements an abstraction of objects that can have access to a vari The supported data store providers : | Provider | Underlying Drivers | Description | -| ---- | ---| ---- | +| :---- | :----: | ----: | | sqlite| Native SQLite|SQLite3| | postgresql| psycopg2 | PostgreSQL | redshift| psycopg2 | Amazon Redshift +| s3| boto3 | Amazon Simple Storage Service | netezza| nzpsql | IBM Neteeza | Files: CSV, TSV| pandas| pandas data-frame | Couchdb| cloudant | Couchbase/Couchdb @@ -24,33 +25,51 @@ Mostly data scientists that don't really care about the underlying database and 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included -3. Useful for ETL +3. 
Useful for data migrations or ETL +# Usage -### Installation +## Installation -Within the virtual environment perform the following command: +Within the virtual environment perform the following : pip install git+https://dev.the-phi.com/git/steve/data-transport.git -Binaries and eggs will be provided later on -### Usage +## In code (Embedded) + +**Reading/Writing Mongodb** + +For this example we assume here we are tunneling through port 27018 and there is not access control: + +``` +import transport +reader = factory.instance(provider='mongodb',context='read',host='localhost',port='27018',db='example',doc='logs') + +df = reader.read() #-- reads the entire collection +print (df.head()) +# +#-- Applying mongodb command +PIPELINE = [{"$group":{"_id":None,"count":{"$sum":1}}}] +_command_={"cursor":{},"allowDiskUse":True,"aggregate":"logs","pipeline":PIPLINE} +df = reader.read(mongo=_command) +print (df.head()) +reader.close() +``` +**Writing to Mongodb** +--- +``` +import transport +improt pandas as pd +writer = factory.instance(provider='mongodb',context='write',host='localhost',port='27018',db='example',doc='logs') + +df = pd.DataFrame({"names":["steve","nico"],"age":[40,30]}) +writer.write(df) +writer.close() +``` -In your code, perform the - import transport - from transport import factory - # - # importing a mongo reader - args = {"host":":","dbname":"","doc":"",["username":"","password":""]} - reader = factory.instance(provider='mongodb',doc=,db=) - # - # reading a document i.e just applying a find (no filters) - # - df = mreader.read() #-- pandas data frame - df.head() # # reading from postgresql From a7df7bfbcec99901c87717e173decd32d4281d9e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 2 Mar 2022 13:15:35 -0600 Subject: [PATCH 078/271] bug fix: has sql --- setup.py | 2 +- transport/sql.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ed82db4..066ebc7 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.4", + "version":"1.4.5", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index c5c8f89..48d7777 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -80,7 +80,9 @@ class SQLRW : try: table = _args['table'] sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) - found = pd.read_sql(sql,self.conn).shape[0] + found = pd.read_sql(sql,self.conn).shape[0] + found = True + except Exception as e: pass return found From 14a551e57b53e9154ceda3ce16985d76249704f4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 3 Mar 2022 16:08:24 -0600 Subject: [PATCH 079/271] Bug fix: sqlalchemy facilities added --- transport/__init__.py | 39 ++++++++++-- transport/sql.py | 135 +++++++++++++++++++++++++++++------------- 2 files changed, 130 insertions(+), 44 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 94b01eb..ce5090b 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -26,7 +26,7 @@ import numpy as np import json import importlib import sys - +import sqlalchemy if sys.version_info[0] > 2 : from transport.common import Reader, Writer #, factory from transport import disk @@ -59,8 +59,8 @@ class factory : "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, 
"redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, - "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, - "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"}}, + "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, + "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}} @@ -137,7 +137,38 @@ def instance(**_args): pointer = factory.PROVIDERS[provider]['class'][_id] else: pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter - + # + # Let us try to establish an sqlalchemy wrapper + try: + host = '' + if provider not in ['bigquery','mongodb','couchdb','sqlite'] : + # + # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery + username = args['username'] if 'username' in args else '' + password = args['password'] if 'password' in args else '' + if username == '' : + account = '' + else: + account = username + ':'+password+'@' + host = args['host'] + if 'port' in args : + host = host+":"+str(args['port']) + + database = args['database'] + elif provider == 'sqlite': + account = '' + host = '' + database = args['path'] if 'path' in args else args['database'] + if provider not in ['mongodb','couchdb','bigquery'] : + uri = ''.join([provider,"://",account,host,'/',database]) + + e = sqlalchemy.create_engine (uri) + args['sqlalchemy'] = e + # + # @TODO: Include handling of bigquery with SQLAlchemy + except Exception as e: + print (e) + return pointer(**args) return None diff --git a/transport/sql.py b/transport/sql.py index 48d7777..9ccccdb 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -12,6 +12,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI import psycopg2 as pg import mysql.connector as my import sys + +import sqlalchemy if sys.version_info[0] > 2 : from transport.common import Reader, Writer #, factory else: @@ -44,7 +46,8 @@ class SQLRW : _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] self.table = _args['table'] if 'table' in _args else None self.fields = _args['fields'] if 'fields' in _args else [] - # _provider = _args['provider'] + + self._provider = _args['provider'] if 'provider' in _args else None # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] @@ -59,7 +62,7 @@ class SQLRW : if 'username' in _args or 'user' in _args: key = 'username' if 'username' in _args else 'user' _info['user'] = _args[key] - _info['password'] = _args['password'] + _info['password'] = _args['password'] if 'password' in _args else '' # # We need to load the drivers here to see what we are dealing with ... 
@@ -74,17 +77,29 @@ class SQLRW : _info['database'] = _info['dbname'] _info['securityLevel'] = 0 del _info['dbname'] + if _handler == my : + _info['database'] = _info['dbname'] + del _info['dbname'] + self.conn = _handler.connect(**_info) + self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None def has(self,**_args): found = False try: table = _args['table'] sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) - found = pd.read_sql(sql,self.conn).shape[0] + if self._engine : + _conn = self._engine.connect() + else: + _conn = self.conn + found = pd.read_sql(sql,_conn).shape[0] found = True except Exception as e: pass + finally: + if self._engine : + _conn.close() return found def isready(self): _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) @@ -104,7 +119,8 @@ class SQLRW : try: if "select" in _sql.lower() : cursor.close() - return pd.read_sql(_sql,self.conn) + _conn = self._engine.connect() if self._engine else self.conn + return pd.read_sql(_sql,_conn) else: # Executing a command i.e no expected return values ... cursor.execute(_sql) @@ -122,7 +138,8 @@ class SQLRW : pass class SQLReader(SQLRW,Reader) : def __init__(self,**_args): - super().__init__(**_args) + super().__init__(**_args) + def read(self,**_args): if 'sql' in _args : _sql = (_args['sql']) @@ -151,27 +168,47 @@ class SQLWriter(SQLRW,Writer): # NOTE: Proper data type should be set on the target system if their source is unclear. self._inspect = False if 'inspect' not in _args else _args['inspect'] self._cast = False if 'cast' not in _args else _args['cast'] + def init(self,fields=None): if not fields : try: - self.fields = pd.read_sql("SELECT * FROM :table LIMIT 1".replace(":table",self.table),self.conn).columns.tolist() + self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",self.table),self.conn).columns.tolist() finally: pass else: self.fields = fields; - def make(self,fields): - self.fields = fields - - sql = " ".join(["CREATE TABLE",self.table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) + def make(self,**_args): + + if 'fields' in _args : + fields = _args['fields'] + sql = " ".join(["CREATE TABLE",self.table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) + else: + schema = _args['schema'] + N = len(schema) + _map = _args['map'] if 'map' in _args else {} + sql = [] # ["CREATE TABLE ",_args['table'],"("] + for _item in schema : + _type = _item['type'] + if _type in _map : + _type = _map[_type] + sql = sql + [" " .join([_item['name'], ' ',_type])] + sql = ",".join(sql) + sql = ["CREATE TABLE ",_args['table'],"( ",sql," )"] + sql = " ".join(sql) + # sql = " ".join(["CREATE TABLE",_args['table']," (", ",".join([ schema[i]['name'] +' '+ (schema[i]['type'] if schema[i]['type'] not in _map else _map[schema[i]['type'] ]) for i in range(0,N)]),")"]) cursor = self.conn.cursor() try: + cursor.execute(sql) except Exception as e : print (e) + print (sql) pass finally: - cursor.close() + # cursor.close() + self.conn.commit() + pass def write(self,info): """ :param info writes a list of data to a given set of fields @@ -184,7 +221,7 @@ class SQLWriter(SQLRW,Writer): elif type(info) == dict : _fields = info.keys() elif type(info) == pd.DataFrame : - _fields = info.columns + _fields = info.columns.tolist() # _fields = info.keys() if type(info) == dict else info[0].keys() _fields = list (_fields) @@ -192,12 +229,13 @@ class SQLWriter(SQLRW,Writer): # # @TODO: Use pandas/odbc ? 
Not sure b/c it requires sqlalchemy # - if type(info) != list : - # - # We are assuming 2 cases i.e dict or pd.DataFrame - info = [info] if type(info) == dict else info.values.tolist() + # if type(info) != list : + # # + # # We are assuming 2 cases i.e dict or pd.DataFrame + # info = [info] if type(info) == dict else info.values.tolist() cursor = self.conn.cursor() try: + _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) if self._inspect : for _row in info : @@ -223,34 +261,49 @@ class SQLWriter(SQLRW,Writer): pass else: - _fields = ",".join(self.fields) + # _sql = _sql.replace(":fields",_fields) # _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) # _sql = _sql.replace("(:fields)","") - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) - if type(info) == pd.DataFrame : - _info = info[self.fields].values.tolist() - elif type(info) == dict : - _info = info.values() - else: - # _info = [] + + # _sql = _sql.replace(":values",values) + # if type(info) == pd.DataFrame : + # _info = info[self.fields].values.tolist() + + # elif type(info) == dict : + # _info = info.values() + # else: + # # _info = [] - _info = pd.DataFrame(info)[self.fields].values.tolist() - # for row in info : - - # if type(row) == dict : - # _info.append( list(row.values())) - cursor.executemany(_sql,_info) + # _info = pd.DataFrame(info)[self.fields].values.tolist() + # _info = pd.DataFrame(info).to_dict(orient='records') + if type(info) == list : + _info = pd.DataFrame(info) + elif type(info) == dict : + _info = pd.DataFrame([info]) + else: + _info = pd.DataFrame(info) + + + if self._engine : + # pd.to_sql(_info,self._engine) + _info.to_sql(self.table,self._engine,if_exists='append',index=False) + else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) + _sql = _sql.replace(":values",values) + + cursor.executemany(_sql,_info.values.tolist()) + # cursor.commit() # self.conn.commit() except Exception as e: print(e) pass finally: - self.conn.commit() - cursor.close() + self.conn.commit() + # cursor.close() pass def close(self): try: @@ -265,6 +318,7 @@ class BigQuery: self.path = path self.dtypes = _args['dtypes'] if 'dtypes' in _args else None self.table = _args['table'] if 'table' in _args else None + self.client = bq.Client.from_service_account_json(self.path) def meta(self,**_args): """ This function returns meta data for a given table or query with dataset/table properly formatted @@ -272,16 +326,16 @@ class BigQuery: :param sql sql query to be pulled, """ table = _args['table'] - client = bq.Client.from_service_account_json(self.path) - ref = client.dataset(self.dataset).table(table) - return client.get_table(ref).schema + + ref = self.client.dataset(self.dataset).table(table) + return self.client.get_table(ref).schema def has(self,**_args): found = False try: found = self.meta(**_args) is not None except Exception as e: pass - return found + return found class BQReader(BigQuery,Reader) : def __init__(self,**_args): @@ -304,8 +358,9 @@ class BQReader(BigQuery,Reader) : if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) _info = 
{'credentials':self.credentials,'dialect':'standard'} - return pd.read_gbq(SQL,**_info) if SQL else None - # return pd.read_gbq(SQL,credentials=self.credentials,dialect='standard') if SQL else None + return pd.read_gbq(SQL,**_info) if SQL else None + # return self.client.query(SQL).to_dataframe() if SQL else None + class BQWriter(BigQuery,Writer): lock = Lock() From fd9442c29844d8891070296bc2edea75b244eafd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 4 Mar 2022 15:00:30 -0600 Subject: [PATCH 080/271] bug fix: write problems, using drivers --- transport/common.py | 2 +- transport/sql.py | 49 ++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/transport/common.py b/transport/common.py index 6e595ae..f706e52 100644 --- a/transport/common.py +++ b/transport/common.py @@ -41,7 +41,7 @@ class Reader (IO): """ def __init__(self): pass - def meta(self): + def meta(self,**_args): """ This function is intended to return meta-data associated with what has just been read @return object of meta data information associated with the content of the store diff --git a/transport/sql.py b/transport/sql.py index 9ccccdb..34ba4fa 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -46,6 +46,7 @@ class SQLRW : _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] self.table = _args['table'] if 'table' in _args else None self.fields = _args['fields'] if 'fields' in _args else [] + self.schema = _args['schema'] if 'schema' in _args else '' self._provider = _args['provider'] if 'provider' in _args else None # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] @@ -83,10 +84,16 @@ class SQLRW : self.conn = _handler.connect(**_info) self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None + def meta(self,**_args): + return [] + def _tablename(self,name) : + + return self.schema +'.'+name if self.schema not in [None, ''] and '.' 
not in name else name def has(self,**_args): found = False try: - table = _args['table'] + + table = self._tablename(_args['table']) sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) if self._engine : _conn = self._engine.connect() @@ -172,20 +179,23 @@ class SQLWriter(SQLRW,Writer): def init(self,fields=None): if not fields : try: - self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",self.table),self.conn).columns.tolist() + table = self._tablename(self.table) + self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() finally: pass else: self.fields = fields; def make(self,**_args): - + table = self._tablename(self.table) if 'table' not in _args else self._tablename(_args['table']) if 'fields' in _args : - fields = _args['fields'] - sql = " ".join(["CREATE TABLE",self.table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) + fields = _args['fields'] + # table = self._tablename(self.table) + sql = " ".join(["CREATE TABLE",table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) + print (sql) else: schema = _args['schema'] - N = len(schema) + _map = _args['map'] if 'map' in _args else {} sql = [] # ["CREATE TABLE ",_args['table'],"("] for _item in schema : @@ -194,7 +204,8 @@ class SQLWriter(SQLRW,Writer): _type = _map[_type] sql = sql + [" " .join([_item['name'], ' ',_type])] sql = ",".join(sql) - sql = ["CREATE TABLE ",_args['table'],"( ",sql," )"] + # table = self._tablename(_args['table']) + sql = ["CREATE TABLE ",table,"( ",sql," )"] sql = " ".join(sql) # sql = " ".join(["CREATE TABLE",_args['table']," (", ",".join([ schema[i]['name'] +' '+ (schema[i]['type'] if schema[i]['type'] not in _map else _map[schema[i]['type'] ]) for i in range(0,N)]),")"]) cursor = self.conn.cursor() @@ -235,8 +246,8 @@ class SQLWriter(SQLRW,Writer): # info = [info] if type(info) == dict else info.values.tolist() cursor = self.conn.cursor() try: - - _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",self.table) #.replace(":table",self.table).replace(":fields",_fields) + table = self._tablename(self.table) + _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) if self._inspect : for _row in info : fields = list(_row.keys()) @@ -285,16 +296,18 @@ class SQLWriter(SQLRW,Writer): _info = pd.DataFrame(info) - if self._engine : - # pd.to_sql(_info,self._engine) - _info.to_sql(self.table,self._engine,if_exists='append',index=False) - else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) + # if self._engine : + # # pd.to_sql(_info,self._engine) + # print (_info.columns.tolist()) + # rows = _info.to_sql(table,self._engine,if_exists='append',index=False) + # print ([rows]) + # else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) + _sql = _sql.replace(":values",values) - cursor.executemany(_sql,_info.values.tolist()) + cursor.executemany(_sql,_info.values.tolist()) # cursor.commit() # self.conn.commit() From cfc683c1b342a2d8670660704352c4507eac8284 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 7 Mar 2022 14:17:27 -0600 Subject: [PATCH 081/271] bug fix: 
transport cli and write function for sql --- bin/transport | 30 ++++++++++++++++++++++++++---- transport/sql.py | 32 ++++++++++++++++---------------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/bin/transport b/bin/transport index 01c5f71..1df4c03 100755 --- a/bin/transport +++ b/bin/transport @@ -68,11 +68,25 @@ class Post(Process): # # If the table doesn't exists maybe create it ? # - self.rows = args['rows'] + self.rows = args['rows'].fillna('') + def run(self): _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - + ltypes = self.rows.dtypes.values + columns = self.rows.dtypes.index.tolist() + if not self.writer.has() : + + + self.writer.make(fields=columns) + # self.log(module='write',action='make-table',input={"name":self.writer.table}) + for name in columns : + if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : + value = 0 + else: + value = '' + _info[name] = _info[name].fillna(value) + print (_info) self.writer.write(_info) self.writer.close() @@ -107,6 +121,8 @@ class ETL (Process): else: idf = self.reader.read() idf = pd.DataFrame(idf) + # idf = idf.replace({np.nan: None}, inplace = True) + idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) @@ -114,6 +130,8 @@ class ETL (Process): # writing the data to a designated data source # try: + + self.log(module='write',action='partitioning') rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) # @@ -152,9 +170,13 @@ if __name__ == '__main__' : _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) etl = ETL (**_config) - etl.start() - procs.append(etl) + if not index : + + etl.start() + procs.append(etl) if index and _info.index(_config) == index : + procs = [etl] + etl.start() break # # diff --git a/transport/sql.py b/transport/sql.py index 34ba4fa..e646a18 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -93,7 +93,7 @@ class SQLRW : found = False try: - table = self._tablename(_args['table']) + table = self._tablename(_args['table'])if 'table' in _args else self._tablename(self.table) sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) if self._engine : _conn = self._engine.connect() @@ -192,9 +192,9 @@ class SQLWriter(SQLRW,Writer): fields = _args['fields'] # table = self._tablename(self.table) sql = " ".join(["CREATE TABLE",table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) - print (sql) + else: - schema = _args['schema'] + schema = _args['schema'] if 'schema' in _args else '' _map = _args['map'] if 'map' in _args else {} sql = [] # ["CREATE TABLE ",_args['table'],"("] @@ -214,7 +214,7 @@ class SQLWriter(SQLRW,Writer): cursor.execute(sql) except Exception as e : print (e) - print (sql) + # print (sql) pass finally: # cursor.close() @@ -296,18 +296,18 @@ class SQLWriter(SQLRW,Writer): _info = pd.DataFrame(info) - # if self._engine : - # # pd.to_sql(_info,self._engine) - # print (_info.columns.tolist()) - # rows = _info.to_sql(table,self._engine,if_exists='append',index=False) - # print ([rows]) - # else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) + if self._engine : + # pd.to_sql(_info,self._engine) - cursor.executemany(_sql,_info.values.tolist()) + rows = 
_info.to_sql(table,self._engine,if_exists='append',index=False) + + else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) + _sql = _sql.replace(":values",values) + + cursor.executemany(_sql,_info.values.tolist()) # cursor.commit() # self.conn.commit() @@ -338,7 +338,7 @@ class BigQuery: :param table name of the name WITHOUT including dataset :param sql sql query to be pulled, """ - table = _args['table'] + table = _args['table'] ref = self.client.dataset(self.dataset).table(table) return self.client.get_table(ref).schema From f672c04844d0277172e677830980cda36e07f41c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 7 Mar 2022 15:29:03 -0600 Subject: [PATCH 082/271] bugfix: sqlalchemy --- transport/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index e646a18..d408942 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -299,7 +299,7 @@ class SQLWriter(SQLRW,Writer): if self._engine : # pd.to_sql(_info,self._engine) - rows = _info.to_sql(table,self._engine,if_exists='append',index=False) + rows = _info.to_sql(table,self._engine,schema=self.schema,if_exists='append',index=False) else: _fields = ",".join(self.fields) From 105ff00224244ec1a0737d0120c70ff91441885a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 7 Mar 2022 18:50:29 -0600 Subject: [PATCH 083/271] bugfix: ETL multiprocessing --- bin/transport | 34 +++++++++------ transport/__init__.py | 2 +- transport/sql.py | 97 +++++++++++++++---------------------------- 3 files changed, 57 insertions(+), 76 deletions(-) diff --git a/bin/transport b/bin/transport index 1df4c03..47979db 100755 --- a/bin/transport +++ b/bin/transport @@ -75,10 +75,10 @@ class Post(Process): _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows ltypes = self.rows.dtypes.values columns = self.rows.dtypes.index.tolist() - if not self.writer.has() : + # if not self.writer.has() : - self.writer.make(fields=columns) + # self.writer.make(fields=columns) # self.log(module='write',action='make-table',input={"name":self.writer.table}) for name in columns : if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : @@ -86,7 +86,7 @@ class Post(Process): else: value = '' _info[name] = _info[name].fillna(value) - print (_info) + self.writer.write(_info) self.writer.close() @@ -94,6 +94,7 @@ class Post(Process): class ETL (Process): def __init__(self,**_args): super().__init__() + self.name = _args['id'] if 'provider' not in _args['source'] : #@deprecate @@ -133,18 +134,24 @@ class ETL (Process): self.log(module='write',action='partitioning') - rows = np.array_split(np.arange(idf.shape[0]),self.JOB_COUNT) + rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) + # # @TODO: locks - for i in rows : - _id = 'segment #'.join([str(rows.index(i)),self.name]) - segment = idf.loc[i,:] #.to_dict(orient='records') + for i in np.arange(self.JOB_COUNT) : + print () + print (i) + _id = 'segment # '.join([str(i),' ',self.name]) + indexes = rows[i] + segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') proc = Post(target = self._oargs,rows = segment,name=_id) self.jobs.append(proc) proc.start() - self.log(module='write',action='working ...',name=self.name) - + self.log(module='write',action='working',segment=_id) + # while poc : + # proc = [job for job in proc if job.is_alive()] + # time.sleep(1) except Exception as 
e: print (e) @@ -168,13 +175,16 @@ if __name__ == '__main__' : if 'source' in SYS_ARGS : _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} - _config['jobs'] = 10 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) + _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) etl = ETL (**_config) - if not index : + if index is None: etl.start() procs.append(etl) - if index and _info.index(_config) == index : + + elif _info.index(_config) == index : + + # print (_config) procs = [etl] etl.start() break diff --git a/transport/__init__.py b/transport/__init__.py index ce5090b..6642c4e 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -162,7 +162,7 @@ def instance(**_args): if provider not in ['mongodb','couchdb','bigquery'] : uri = ''.join([provider,"://",account,host,'/',database]) - e = sqlalchemy.create_engine (uri) + e = sqlalchemy.create_engine (uri,future=True) args['sqlalchemy'] = e # # @TODO: Include handling of bigquery with SQLAlchemy diff --git a/transport/sql.py b/transport/sql.py index d408942..6d44976 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -21,7 +21,7 @@ else: import json from google.oauth2 import service_account from google.cloud import bigquery as bq -from multiprocessing import Lock +from multiprocessing import Lock, RLock import pandas as pd import numpy as np import nzpy as nz #--- netezza drivers @@ -30,7 +30,7 @@ import os class SQLRW : - + lock = RLock() DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} REFERENCE = { "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, @@ -71,7 +71,7 @@ class SQLRW : # _handler = SQLWriter.REFERENCE[_provider]['handler'] _handler = _args['driver'] #-- handler to the driver self._dtype = _args['default']['type'] if 'default' in _args and 'type' in _args['default'] else 'VARCHAR(256)' - self._provider = _args['provider'] + # self._provider = _args['provider'] # self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] # self._provider = _provider if _handler == nz : @@ -173,7 +173,7 @@ class SQLWriter(SQLRW,Writer): # In the advent that data typing is difficult to determine we can inspect and perform a default case # This slows down the process but improves reliability of the data # NOTE: Proper data type should be set on the target system if their source is unclear. 
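The write() hunks around this change all converge on the same flow: whatever arrives (a dict, a list of records, or a DataFrame) is first coerced into a DataFrame, the write is serialized behind the class-level RLock, and the rows are flushed either through SQLAlchemy's to_sql or a parameterized executemany. The snippet below is only an illustrative sketch of that flow, not the SQLWriter implementation; it uses an in-memory sqlite3 connection and a made-up `logs` table so it can run on its own (sqlite3 takes `?` placeholders, where the drivers in this module use `%s`, or `?` for netezza).

```python
# Illustrative sketch only, not the SQLWriter implementation.
# The table name and sample rows are hypothetical; sqlite3 stands in for the
# driver connection so the snippet is self-contained.
import sqlite3
from multiprocessing import RLock

import pandas as pd

_lock = RLock()

def write(conn, table, info, fields):
    # Coerce dict / list of dicts / DataFrame into a single DataFrame
    _info = pd.DataFrame([info]) if isinstance(info, dict) else pd.DataFrame(info)
    if _info.shape[0] == 0:
        return
    with _lock:  # serialize concurrent writers (the ETL spawns several processes)
        _sql = "INSERT INTO :table (:fields) VALUES (:values)"
        _sql = _sql.replace(":table", table).replace(":fields", ",".join(fields))
        _sql = _sql.replace(":values", ",".join(["?" for _ in fields]))
        cursor = conn.cursor()
        cursor.executemany(_sql, _info[fields].values.tolist())
        cursor.close()
        conn.commit()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE logs (name VARCHAR(256), value VARCHAR(256))")
write(conn, "logs", [{"name": "a", "value": "1"}, {"name": "b", "value": "2"}], ["name", "value"])
print(pd.read_sql("SELECT * FROM logs", conn))
```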
- self._inspect = False if 'inspect' not in _args else _args['inspect'] + self._cast = False if 'cast' not in _args else _args['cast'] def init(self,fields=None): @@ -244,78 +244,49 @@ class SQLWriter(SQLRW,Writer): # # # # We are assuming 2 cases i.e dict or pd.DataFrame # info = [info] if type(info) == dict else info.values.tolist() - cursor = self.conn.cursor() + try: table = self._tablename(self.table) _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) - if self._inspect : - for _row in info : - fields = list(_row.keys()) - if self._cast == False : - values = ",".join(_row.values()) - else: - # values = "'"+"','".join([str(value) for value in _row.values()])+"'" - values = [",".join(["%(",name,")s"]) for name in _row.keys()] - - # values = [ "".join(["'",str(_row[key]),"'"]) if np.nan(_row[key]).isnumeric() else str(_row[key]) for key in _row] - # print (values) - query = _sql.replace(":fields",",".join(fields)).replace(":values",values) - if type(info) == pd.DataFrame : - _values = info.values.tolist() - elif type(info) == list and type(info[0]) == dict: - print ('........') - _values = [tuple(item.values()) for item in info] - else: - _values = info; - cursor.execute(query,_values) - - - pass + + if type(info) == list : + _info = pd.DataFrame(info) + elif type(info) == dict : + _info = pd.DataFrame([info]) else: - - # _sql = _sql.replace(":fields",_fields) - # _sql = _sql.replace(":values",",".join(["%("+name+")s" for name in self.fields])) - # _sql = _sql.replace("(:fields)","") - - # _sql = _sql.replace(":values",values) - # if type(info) == pd.DataFrame : - # _info = info[self.fields].values.tolist() - - # elif type(info) == dict : - # _info = info.values() - # else: - # # _info = [] - - # _info = pd.DataFrame(info)[self.fields].values.tolist() - # _info = pd.DataFrame(info).to_dict(orient='records') - if type(info) == list : - _info = pd.DataFrame(info) - elif type(info) == dict : - _info = pd.DataFrame([info]) - else: - _info = pd.DataFrame(info) + _info = pd.DataFrame(info) + + if _info.shape[0] == 0 : - if self._engine : - # pd.to_sql(_info,self._engine) - - rows = _info.to_sql(table,self._engine,schema=self.schema,if_exists='append',index=False) - + return + SQLRW.lock.acquire() + if self._engine is not None: + # pd.to_sql(_info,self._engine) + if self.schema in ['',None] : + rows = _info.to_sql(table,self._engine,if_exists='append',index=False) else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) - - cursor.executemany(_sql,_info.values.tolist()) - # cursor.commit() + rows = _info.to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) + + else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) + _sql = _sql.replace(":values",values) + cursor = self.conn.cursor() + cursor.executemany(_sql,_info.values.tolist()) + cursor.close() + # cursor.commit() # self.conn.commit() except Exception as e: print(e) pass finally: - self.conn.commit() + + if self._engine is None : + self.conn.commit() + SQLRW.lock.release() # cursor.close() pass def close(self): From 38e1bce6c2f02a0af33801c338a6bf72741b22e1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: 
Sat, 12 Mar 2022 12:25:29 -0600 Subject: [PATCH 084/271] bug fixes and optimizations --- README.md | 37 ++++++++++++++++++++++++++++++++++++- setup.py | 2 +- transport/sql.py | 15 +++++++++------ 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 805fb8f..5ec2113 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,44 @@ Within the virtual environment perform the following : pip install git+https://dev.the-phi.com/git/steve/data-transport.git +Once installed **data-transport** can be used as a library in code or a command line interface (CLI) +## Data Transport as a Library (in code) +--- + +The data-transport can be used within code as a library +* Read/Write against [mongodb](https://github.com/lnyemba/data-transport/wiki/mongodb) +* Read/Write against tranditional [RDBMS](https://github.com/lnyemba/data-transport/wiki/rdbms) +* Read/Write against [bigquery](https://github.com/lnyemba/data-transport/wiki/bigquery) + +The read/write functions make data-transport a great candidate for **data-science**; **data-engineering** or all things pertaining to data. It enables operations across multiple data-stores(relational or not) + +## Command Line Interface (CLI) +--- +The CLI program is called **transport** and it requires a configuratio file + +``` +[ + { + "id":"logs", + "source":{ + "provider":"postgresql","context":"read","database":"mydb", + "cmd":{"sql":"SELECT * FROM logs limit 10"} + }, + "target":{ + "provider":"bigquery","private_key":"/bgqdrive/account/bq-service-account-key.json", + "dataset":"mydataset" + } + }, + +] +``` -## In code (Embedded) +Assuming the above content is stored in a file called **etl-config.json**, we would perform the following in a terminal window: + +``` +[steve@data-transport]$ transport --config ./etl-config.json [--index ] +``` **Reading/Writing Mongodb** diff --git a/setup.py b/setup.py index 066ebc7..c4ca8ea 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ args = { "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','pandas','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy','pandas','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/sql.py b/transport/sql.py index 6d44976..19d8ecc 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -125,9 +125,9 @@ class SQLRW : _out = None try: if "select" in _sql.lower() : - cursor.close() - _conn = self._engine.connect() if self._engine else self.conn - return pd.read_sql(_sql,_conn) + + # _conn = self._engine if self._engine else self.conn + return pd.read_sql(_sql,self.conn) else: # Executing a command i.e no expected return values ... 
cursor.execute(_sql) @@ -151,7 +151,8 @@ class SQLReader(SQLRW,Reader) : if 'sql' in _args : _sql = (_args['sql']) else: - _sql = "SELECT :fields FROM "+self.table + table = self.table if self.table is not None else _args['table'] + _sql = "SELECT :fields FROM "+self._tablename(table) if 'filter' in _args : _sql = _sql +" WHERE "+_args['filter'] _fields = '*' if not self.fields else ",".join(self.fields) @@ -220,7 +221,7 @@ class SQLWriter(SQLRW,Writer): # cursor.close() self.conn.commit() pass - def write(self,info): + def write(self,info,**_args): """ :param info writes a list of data to a given set of fields """ @@ -324,7 +325,8 @@ class BQReader(BigQuery,Reader) : def __init__(self,**_args): super().__init__(**_args) - + def apply(self,sql): + self.read(sql=sql) pass def read(self,**_args): SQL = None @@ -359,6 +361,7 @@ class BQWriter(BigQuery,Writer): try: if self.parallel or 'lock' in _args : BQWriter.lock.acquire() + _args['table'] = self.table if 'table' not in _args else _args['table'] self._write(_info,**_args) finally: if self.parallel: From e5fadc64a06200274b0e47bcb67ce94f4ec4854a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 19 Mar 2022 00:02:53 -0500 Subject: [PATCH 085/271] optimizations mongodb --- bin/transport | 14 +++++++------- transport/mongo.py | 18 ++++++++++++++---- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/bin/transport b/bin/transport index 47979db..6680f64 100755 --- a/bin/transport +++ b/bin/transport @@ -63,8 +63,8 @@ class Post(Process): else: self.PROVIDER = args['target']['provider'] args['target']['context'] = 'write' - - self.writer = transport.instance(**args['target']) + self.store = args['target'] + # self.writer = transport.instance(**args['target']) # # If the table doesn't exists maybe create it ? 
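Before spawning its Post workers, ETL.run splits the frame's row index into JOB_COUNT chunks, skips empty chunks, and hands each slice to a separate process. A minimal sketch of that partitioning step, with a made-up frame and job count:

```python
# Sketch of the row partitioning done before spawning Post workers.
# The DataFrame content and JOB_COUNT value are hypothetical.
import numpy as np
import pandas as pd

JOB_COUNT = 3
idf = pd.DataFrame({"id": range(7), "name": list("abcdefg")})

rows = np.array_split(np.arange(0, idf.shape[0]), JOB_COUNT)
for i in np.arange(JOB_COUNT):
    indexes = rows[i]
    segment = idf.loc[indexes, :].copy()   # each worker gets its own copy
    if segment.shape[0] == 0:              # skip empty segments
        continue
    print("segment #", i, ":", segment.shape[0], "rows")
```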
# @@ -86,9 +86,9 @@ class Post(Process): else: value = '' _info[name] = _info[name].fillna(value) - - self.writer.write(_info) - self.writer.close() + writer = transport.factory.instance(**self.store) + writer.write(_info) + writer.close() class ETL (Process): @@ -139,11 +139,11 @@ class ETL (Process): # # @TODO: locks for i in np.arange(self.JOB_COUNT) : - print () - print (i) _id = 'segment # '.join([str(i),' ',self.name]) indexes = rows[i] segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') + if segment.shape[0] == 0 : + continue proc = Post(target = self._oargs,rows = segment,name=_id) self.jobs.append(proc) proc.start() diff --git a/transport/mongo.py b/transport/mongo.py index d1ee9ef..9f4ff11 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -20,7 +20,9 @@ else: from common import Reader, Writer import json import re +from multiprocessing import Lock, RLock class Mongo : + lock = RLock() """ Basic mongodb functions are captured here """ @@ -44,6 +46,7 @@ class Mongo : self.uid = args['doc'] #-- document identifier self.dbname = args['dbname'] if 'dbname' in args else args['db'] self.db = self.client[self.dbname] + self._lock = False if 'lock' not in args else args['lock'] def isready(self): p = self.dbname in self.client.list_database_names() @@ -144,10 +147,17 @@ class MongoWriter(Mongo,Writer): # if type(info) == list : # self.db[self.uid].insert_many(info) # else: - if type(info) == list or type(info) == pd.DataFrame : - self.db[self.uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) - else: - self.db[self.uid].insert_one(info) + try: + + if self._lock : + Mongo.lock.acquire() + if type(info) == list or type(info) == pd.DataFrame : + self.db[self.uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) + else: + self.db[self.uid].insert_one(info) + finally: + if self._lock : + Mongo.lock.release() def set(self,document): """ if no identifier is provided the function will delete the entire collection and set the new document. From 6c406407b21d17ba1c000a775a0fed88431456bb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 31 Mar 2022 17:13:24 -0500 Subject: [PATCH 086/271] bug fix: ... 
minor update sqlalchemy --- setup.py | 2 +- transport/__init__.py | 1 + transport/sql.py | 5 +++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index c4ca8ea..971f538 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.5", + "version":"1.4.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 6642c4e..1feca91 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -164,6 +164,7 @@ def instance(**_args): e = sqlalchemy.create_engine (uri,future=True) args['sqlalchemy'] = e + # # @TODO: Include handling of bigquery with SQLAlchemy except Exception as e: diff --git a/transport/sql.py b/transport/sql.py index 19d8ecc..a0893a9 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -195,7 +195,7 @@ class SQLWriter(SQLRW,Writer): sql = " ".join(["CREATE TABLE",table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) else: - schema = _args['schema'] if 'schema' in _args else '' + schema = _args['schema'] if 'schema' in _args else [] _map = _args['map'] if 'map' in _args else {} sql = [] # ["CREATE TABLE ",_args['table'],"("] @@ -208,7 +208,7 @@ class SQLWriter(SQLRW,Writer): # table = self._tablename(_args['table']) sql = ["CREATE TABLE ",table,"( ",sql," )"] sql = " ".join(sql) - # sql = " ".join(["CREATE TABLE",_args['table']," (", ",".join([ schema[i]['name'] +' '+ (schema[i]['type'] if schema[i]['type'] not in _map else _map[schema[i]['type'] ]) for i in range(0,N)]),")"]) + cursor = self.conn.cursor() try: @@ -262,6 +262,7 @@ class SQLWriter(SQLRW,Writer): return SQLRW.lock.acquire() + if self._engine is not None: # pd.to_sql(_info,self._engine) if self.schema in ['',None] : From 98eaa99820fd3bfbd82e17088eb996b63b015f71 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 18:34:32 -0500 Subject: [PATCH 087/271] bug fixes and enhancements, added console output(for logging) --- bin/transport | 161 ++++++------------------------------------ setup.py | 4 +- transport/__init__.py | 40 ++++++----- transport/common.py | 24 +++++++ transport/disk.py | 12 ++-- 5 files changed, 78 insertions(+), 163 deletions(-) diff --git a/bin/transport b/bin/transport index 6680f64..7200e72 100755 --- a/bin/transport +++ b/bin/transport @@ -53,148 +53,29 @@ if len(sys.argv) > 1: i += 2 -class Post(Process): - def __init__(self,**args): - super().__init__() - - if 'provider' not in args['target'] : - self.PROVIDER = args['target']['type'] - self.writer = transport.factory.instance(**args['target']) - else: - self.PROVIDER = args['target']['provider'] - args['target']['context'] = 'write' - self.store = args['target'] - # self.writer = transport.instance(**args['target']) - # - # If the table doesn't exists maybe create it ? 
- # - self.rows = args['rows'].fillna('') - - - def run(self): - _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - ltypes = self.rows.dtypes.values - columns = self.rows.dtypes.index.tolist() - # if not self.writer.has() : - - - # self.writer.make(fields=columns) - # self.log(module='write',action='make-table',input={"name":self.writer.table}) - for name in columns : - if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : - value = 0 - else: - value = '' - _info[name] = _info[name].fillna(value) - writer = transport.factory.instance(**self.store) - writer.write(_info) - writer.close() - - -class ETL (Process): - def __init__(self,**_args): - super().__init__() - - self.name = _args['id'] - if 'provider' not in _args['source'] : - #@deprecate - self.reader = transport.factory.instance(**_args['source']) - else: - # - # This is the new interface - _args['source']['context'] = 'read' - - self.reader = transport.instance(**_args['source']) - # - # do we have an sql query provided or not .... - # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None - self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None - self._oargs = _args['target'] #transport.factory.instance(**_args['target']) - self.JOB_COUNT = _args['jobs'] - self.jobs = [] - # self.logger = transport.factory.instance(**_args['logger']) - def log(self,**_args) : - _args['name'] = self.name - print (_args) - def run(self): - if self.cmd : - idf = self.reader.read(**self.cmd) - else: - idf = self.reader.read() - idf = pd.DataFrame(idf) - # idf = idf.replace({np.nan: None}, inplace = True) - - idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) - - # - # writing the data to a designated data source - # +if __name__ == '__main__' : + # + # Load information from the file ... + if 'help' in SYS_ARGS : + print (__doc__) + else: try: + _info = json.loads(open(SYS_ARGS['config']).read()) + if 'index' in SYS_ARGS : + _index = int(SYS_ARGS['index']) + _info = [_item for _item in _info if _info.index(_item) == _index] + pass - - self.log(module='write',action='partitioning') - rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) - - # - # @TODO: locks - for i in np.arange(self.JOB_COUNT) : - _id = 'segment # '.join([str(i),' ',self.name]) - indexes = rows[i] - segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') - if segment.shape[0] == 0 : - continue - proc = Post(target = self._oargs,rows = segment,name=_id) - self.jobs.append(proc) - proc.start() - - self.log(module='write',action='working',segment=_id) - # while poc : - # proc = [job for job in proc if job.is_alive()] - # time.sleep(1) + procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) + jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) + while jobs : + x = len(jobs) + jobs = [_job for _job in jobs if _job.is_alive()] + if x != len(jobs) : + print ([len(jobs),'... jobs running']) + time.sleep(1) except Exception as e: - print (e) - - def is_done(self): - self.jobs = [proc for proc in self.jobs if proc.is_alive()] - return len(self.jobs) == 0 -def apply(_args) : - """ - This function will apply a set of commands against a data-store. 
The expected structure is as follows : - {"store":...,"apply":[]} - """ - handler = transport.factory.instance(**_args['store']) - for cmd in _args['apply'] : - handler.apply(cmd) - handler.close() -if __name__ == '__main__' : - _info = json.loads(open (SYS_ARGS['config']).read()) - index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None - procs = [] - for _config in _info : - if 'source' in SYS_ARGS : - _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} - - _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) - etl = ETL (**_config) - if index is None: - etl.start() - procs.append(etl) - - elif _info.index(_config) == index : + print (e) - # print (_config) - procs = [etl] - etl.start() - break - # - # - N = len(procs) - while procs : - procs = [thread for thread in procs if not thread.is_done()] - if len(procs) < N : - print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) - N = len(procs) - time.sleep(1) - print ("We're done !!") \ No newline at end of file + \ No newline at end of file diff --git a/setup.py b/setup.py index 971f538..288e3f7 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.6", + "version":"1.4.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','sqlalchemy','pandas','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index 1feca91..d21e412 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -28,22 +28,24 @@ import importlib import sys import sqlalchemy if sys.version_info[0] > 2 : - from transport.common import Reader, Writer #, factory - from transport import disk + from transport.common import Reader, Writer,Console #, factory + from transport import disk - from transport import s3 as s3 - from transport import rabbitmq as queue - from transport import couch as couch - from transport import mongo as mongo - from transport import sql as sql + from transport import s3 as s3 + from transport import rabbitmq as queue + from transport import couch as couch + from transport import mongo as mongo + from transport import sql as sql + from transport import etl as etl else: - from common import Reader, Writer #, factory - import disk - import queue - import couch - import mongo - import s3 - import sql + from common import Reader, Writer,Console #, factory + import disk + import queue + import couch + import mongo + import s3 + import sql + import etl import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq @@ -51,9 +53,12 @@ import nzpy as nz #--- netezza drivers import os + class factory : TYPE = 
{"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { + "etl":{"class":{"read":etl.instance}}, + "console":{"class":{"write":Console,"read":Console}}, "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, @@ -140,8 +145,9 @@ def instance(**_args): # # Let us try to establish an sqlalchemy wrapper try: + host = '' - if provider not in ['bigquery','mongodb','couchdb','sqlite'] : + if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file'] : # # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery username = args['username'] if 'username' in args else '' @@ -159,7 +165,7 @@ def instance(**_args): account = '' host = '' database = args['path'] if 'path' in args else args['database'] - if provider not in ['mongodb','couchdb','bigquery'] : + if provider not in ['mongodb','couchdb','bigquery','console','etl','file'] : uri = ''.join([provider,"://",account,host,'/',database]) e = sqlalchemy.create_engine (uri,future=True) @@ -170,6 +176,6 @@ def instance(**_args): except Exception as e: print (e) - return pointer(**args) + return pointer(**args) return None diff --git a/transport/common.py b/transport/common.py index f706e52..377d9a6 100644 --- a/transport/common.py +++ b/transport/common.py @@ -21,6 +21,7 @@ __author__ = 'The Phi Technology' import numpy as np import json import importlib +from multiprocessing import RLock # import couch # import mongo class IO: @@ -89,6 +90,29 @@ class ReadWriter(Reader,Writer) : This class implements the read/write functions aggregated """ pass +class Console(Writer): + lock = RLock() + def __init__(self,**_args): + self.lock = _args['lock'] if 'lock' in _args else False + self.info = self.write + self.debug = self.write + self.log = self.write + pass + def write (self,info,**_args): + if self.lock : + Console.lock.acquire() + try: + if type(info) == list: + for row in info : + print (row) + else: + print (info) + except Exception as e : + print (e) + finally: + if self.lock : + Console.lock.release() + # class factory : # @staticmethod # def instance(**args): diff --git a/transport/disk.py b/transport/disk.py index 14bb8a0..16e57de 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -21,14 +21,17 @@ class DiskReader(Reader) : """ Reader.__init__(self) - self.path = params['path'] ; + self.path = params['path'] if 'path' in params else None self.delimiter = params['delimiter'] if 'delimiter' in params else ',' + def isready(self): return os.path.exists(self.path) + def meta(self,**_args): + return [] def read(self,**args): _path = self.path if 'path' not in args else args['path'] _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter'] - return pd.read_csv(self.path,delimiter=self.delimiter) + return pd.read_csv(_path,delimiter=self.delimiter) def stream(self,**args): """ This function reads the rows from a designated location on disk @@ -84,15 +87,16 @@ class DiskWriter(Writer): self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) self.cache['meta']['rows'] += 1 return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" - def write(self,info): + def write(self,info,**_args): """ This function writes a record to a designated file @param label @param row row to be written """ try: + _mode = 
'a' if 'overwrite' not in _args else 'w' DiskWriter.THREAD_LOCK.acquire() - f = open(self.path,'a') + f = open(self.path,_mode) if self.delimiter : if type(info) == list : for row in info : From e9a6779a9b864692da7affcbc77978c1f19e39af Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Apr 2022 23:31:47 -0500 Subject: [PATCH 088/271] new file and documentation --- README.md | 26 +++++- transport/etl.py | 221 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 244 insertions(+), 3 deletions(-) create mode 100644 transport/etl.py diff --git a/README.md b/README.md index 5ec2113..0add2c7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Introduction -This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL** and **SQL** data stores and leverages **pandas** +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL** and **SQL** data stores and leverages **pandas**. The supported data store providers : @@ -44,12 +44,32 @@ The data-transport can be used within code as a library * Read/Write against [mongodb](https://github.com/lnyemba/data-transport/wiki/mongodb) * Read/Write against tranditional [RDBMS](https://github.com/lnyemba/data-transport/wiki/rdbms) * Read/Write against [bigquery](https://github.com/lnyemba/data-transport/wiki/bigquery) +* ETL CLI/Code [ETL](https://github.com/lnyemba/data-transport/wiki/etl) The read/write functions make data-transport a great candidate for **data-science**; **data-engineering** or all things pertaining to data. It enables operations across multiple data-stores(relational or not) -## Command Line Interface (CLI) +## ETL + +**Embedded in Code** + +It is possible to perform ETL within custom code as follows : + +``` + import transport + import time + + _info = [{source:{'provider':'sqlite','path':'/home/me/foo.csv','table':'me'},target:{provider:'bigquery',private_key='/home/me/key.json','table':'me','dataset':'mydataset'}}, ...] + procs = transport.factory.instance(provider='etl',info=_info) + # + # + while procs: + procs = [pthread for pthread in procs if pthread.is_alive()] + time.sleep(1) +``` + +**Command Line Interface (CLI):** --- -The CLI program is called **transport** and it requires a configuratio file +The CLI program is called **transport** and it requires a configuration file. The program is intended to move data from one location to another. Supported data stores are in the above paragraphs. 
``` [ diff --git a/transport/etl.py b/transport/etl.py new file mode 100644 index 0000000..1d5e62d --- /dev/null +++ b/transport/etl.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python +__doc__ = """ +(c) 2018 - 2021 data-transport +steve@the-phi.com, The Phi Technology LLC +https://dev.the-phi.com/git/steve/data-transport.git + +This program performs ETL between 9 supported data sources : Couchdb, Mongodb, Mysql, Mariadb, PostgreSQL, Netezza,Redshift, Sqlite, File +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Usage : + transport --config --procs +@TODO: Create tables if they don't exist for relational databases +example of configuration : + +1. Move data from a folder to a data-store + transport [--folder ] --config #-- assuming the configuration doesn't have folder + transport --folder --provider -- --table|doc +In this case the configuration should look like : + {folder:..., target:{}} +2. Move data from one source to another + transport --config + {source:{..},target:{..}} or [{source:{..},target:{..}},{source:{..},target:{..}}] + + +""" +import pandas as pd +import numpy as np +import json +import sys +import transport +import time +from multiprocessing import Process +SYS_ARGS = {} +if len(sys.argv) > 1: + + N = len(sys.argv) + for i in range(1,N): + value = None + if sys.argv[i].startswith('--'): + key = sys.argv[i][2:] #.replace('-','') + SYS_ARGS[key] = 1 + if i + 1 < N: + value = sys.argv[i + 1] = sys.argv[i+1].strip() + if key and value and not value.startswith('--'): + SYS_ARGS[key] = value + + + i += 2 + +class Post(Process): + def __init__(self,**args): + super().__init__() + + if 'provider' not in args['target'] : + self.PROVIDER = args['target']['type'] + self.writer = transport.factory.instance(**args['target']) + else: + self.PROVIDER = args['target']['provider'] + args['target']['context'] = 'write' + self.store = args['target'] + # self.writer = transport.instance(**args['target']) + # + # If the table doesn't exists maybe create it ? 
+ # + self.rows = args['rows'].fillna('') + + + def run(self): + _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + ltypes = self.rows.dtypes.values + columns = self.rows.dtypes.index.tolist() + # if not self.writer.has() : + + + # self.writer.make(fields=columns) + # ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table}) + for name in columns : + if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : + value = 0 + else: + value = '' + _info[name] = _info[name].fillna(value) + writer = transport.factory.instance(**self.store) + writer.write(_info) + writer.close() + + +class ETL (Process): + logger = None + def __init__(self,**_args): + super().__init__() + + self.name = _args['id'] + if 'provider' not in _args['source'] : + #@deprecate + self.reader = transport.factory.instance(**_args['source']) + else: + # + # This is the new interface + _args['source']['context'] = 'read' + + self.reader = transport.instance(**_args['source']) + # + # do we have an sql query provided or not .... + # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None + self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None + self._oargs = _args['target'] #transport.factory.instance(**_args['target']) + self.JOB_COUNT = _args['jobs'] + self.jobs = [] + # self.logger = transport.factory.instance(**_args['logger']) + def log(self,**_args) : + _args['name'] = self.name + print (_args) + def run(self): + if self.cmd : + idf = self.reader.read(**self.cmd) + else: + idf = self.reader.read() + idf = pd.DataFrame(idf) + # idf = idf.replace({np.nan: None}, inplace = True) + + idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] + ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) + + # + # writing the data to a designated data source + # + try: + + + ETL.logger.info(module='write',action='partitioning') + rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) + + # + # @TODO: locks + for i in np.arange(self.JOB_COUNT) : + # _id = ' '.join([str(i),' table ',self.name]) + indexes = rows[i] + segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') + if segment.shape[0] == 0 : + continue + proc = Post(target = self._oargs,rows = segment,name=str(i)) + self.jobs.append(proc) + proc.start() + + ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0]) + # while poc : + # proc = [job for job in proc if job.is_alive()] + # time.sleep(1) + except Exception as e: + print (e) + + def is_done(self): + self.jobs = [proc for proc in self.jobs if proc.is_alive()] + return len(self.jobs) == 0 +def instance(**_args): + """ + :param _info list of objects with {source,target}` + :param logger + """ + logger = _args['logger'] if 'logger' in _args else None + _info = _args['info'] + if logger : + ETL.logger = logger + else: + ETL.logger = transport.factory.instance(provider='console',lock=True) + if type(_info) in [list,dict] : + _config = _info if type(_info) != dict else [_info] + # + # The assumption here is that the objects within the list are {source,target} + jobs = [] + for _item in _info : + + _item['jobs'] = 5 if 'procs' not in _args else int(_args['procs']) + _job = ETL(**_item) + _job.start() + jobs.append(_job) + return jobs + + else: + return None + +if __name__ == '__main__' : + _info = json.loads(open (SYS_ARGS['config']).read()) + index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None 
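The __main__ block being added here reads the configuration list and, when --index is supplied, runs only the entry at that position; otherwise every entry is started. A small sketch of that selection logic, with placeholder entries rather than a real deployment:

```python
# Sketch of the --index selection; the two configuration entries are placeholders.
import json

_info = json.loads("""
[
 {"id": "logs",  "source": {"provider": "sqlite"},     "target": {"provider": "console"}},
 {"id": "users", "source": {"provider": "postgresql"}, "target": {"provider": "bigquery"}}
]
""")

index = 1  # would normally come from SYS_ARGS['index']
for _config in _info:
    if index is None:
        print("starting job", _config["id"])       # run every entry
    elif _info.index(_config) == index:
        print("starting only job", _config["id"])  # run just the selected entry
        break
```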
+ procs = [] + for _config in _info : + if 'source' in SYS_ARGS : + _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} + + _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) + print (_config) + print () + etl = ETL (**_config) + if index is None: + + etl.start() + procs.append(etl) + + elif _info.index(_config) == index : + + # print (_config) + procs = [etl] + etl.start() + break + # + # + N = len(procs) + while procs : + procs = [thread for thread in procs if not thread.is_done()] + if len(procs) < N : + print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) + N = len(procs) + time.sleep(1) + # print ("We're done !!") \ No newline at end of file From fd9523e99aa0e31392771caf496720c4f7f09353 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 29 Apr 2022 11:15:32 -0500 Subject: [PATCH 089/271] enhancement mongodb --- setup.py | 2 +- transport/mongo.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 288e3f7..f87db4d 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.8", + "version":"1.4.10", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index 9f4ff11..1be36a9 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -137,7 +137,7 @@ class MongoWriter(Mongo,Writer): pass - def write(self,info): + def write(self,info,**_args): """ This function will write to a given collection i.e add a record to a collection (no updates) @param info new record in the collection to be added @@ -148,13 +148,13 @@ class MongoWriter(Mongo,Writer): # self.db[self.uid].insert_many(info) # else: try: - + _uid = self.uid if 'doc' not in _args else _args['doc'] if self._lock : Mongo.lock.acquire() if type(info) == list or type(info) == pd.DataFrame : - self.db[self.uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) + self.db[_uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) else: - self.db[self.uid].insert_one(info) + self.db[_uid].insert_one(info) finally: if self._lock : Mongo.lock.release() From 899339f11fb01394f4c36c274427ff13bea7db71 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 May 2022 14:25:12 -0500 Subject: [PATCH 090/271] bug fix: authentication mongodb --- setup.py | 2 +- transport/mongo.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index f87db4d..67ecfc4 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.4.10", + "version":"1.5.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index 1be36a9..fd2c5b8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -33,20 +33,24 @@ class Mongo : :username username for authentication :password password for current user """ - host = args['host'] if 'host' in args else 'localhost:27017' + port = str(args['port']) if 'port' in args else '27017' + host = args['host'] if 'host' in args else 'localhost' + host = ":".join([host,port]) #-- Formatting host information here + self.uid = args['doc'] if 'doc' in 
args else None #-- document identifier + self.dbname = args['dbname'] if 'dbname' in args else args['db'] + self._lock = False if 'lock' not in args else args['lock'] + if 'user' in args and 'password' in args: self.client = MongoClient(host, username=args['username'] , password=args['password'] , + authSource=(args['authSource'] if 'authSource' in args else self.dbname), authMechanism='SCRAM-SHA-256') else: self.client = MongoClient(host,maxPoolSize=10000) - self.uid = args['doc'] #-- document identifier - self.dbname = args['dbname'] if 'dbname' in args else args['db'] self.db = self.client[self.dbname] - self._lock = False if 'lock' not in args else args['lock'] def isready(self): p = self.dbname in self.client.list_database_names() From 67cb7de8617beb95d4c5cc77bdf68431bc108418 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 May 2022 14:30:59 -0500 Subject: [PATCH 091/271] bug fix: logging with ETL --- transport/etl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transport/etl.py b/transport/etl.py index 1d5e62d..55e8ef6 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -96,7 +96,7 @@ class ETL (Process): def __init__(self,**_args): super().__init__() - self.name = _args['id'] + self.name = _args['id'] if 'id' in _args else 'UNREGISTERED' if 'provider' not in _args['source'] : #@deprecate self.reader = transport.factory.instance(**_args['source']) @@ -126,7 +126,7 @@ class ETL (Process): # idf = idf.replace({np.nan: None}, inplace = True) idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) + # ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) # # writing the data to a designated data source @@ -134,7 +134,7 @@ class ETL (Process): try: - ETL.logger.info(module='write',action='partitioning') + # ETL.logger.info(module='write',action='partitioning') rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) # @@ -149,7 +149,7 @@ class ETL (Process): self.jobs.append(proc) proc.start() - ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0]) + # ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0]) # while poc : # proc = [job for job in proc if job.is_alive()] # time.sleep(1) From 8cd34d902ae70868e1a2bb68f9f05169906c32c4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 11 May 2022 11:17:27 -0500 Subject: [PATCH 092/271] bug fix: ETL logging and rabbitmq-server listener --- transport/__init__.py | 10 +++-- transport/common.py | 8 ++-- transport/etl.py | 92 ++++++++++++++++++++++--------------------- transport/rabbitmq.py | 23 ++++++----- transport/sql.py | 8 ++-- 5 files changed, 74 insertions(+), 67 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index d21e412..6822138 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -68,11 +68,13 @@ class factory : "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, - "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}} + "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}, + 
"rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener},"default":{"type":"application/json"}}} # # creating synonyms PROVIDERS['mongodb'] = PROVIDERS['mongo'] PROVIDERS['couchdb'] = PROVIDERS['couch'] + PROVIDERS['bq'] = PROVIDERS['bigquery'] PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] @staticmethod @@ -124,7 +126,7 @@ def instance(**_args): provider = _args['provider'] context = _args['context']if 'context' in _args else None - _id = context if context in ['read','write'] else 'read' + _id = context if context in list(factory.PROVIDERS[provider]['class'].keys()) else 'read' if _id : args = {'provider':_id} for key in factory.PROVIDERS[provider] : @@ -147,7 +149,7 @@ def instance(**_args): try: host = '' - if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file'] : + if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file','rabbitmq'] : # # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery username = args['username'] if 'username' in args else '' @@ -165,7 +167,7 @@ def instance(**_args): account = '' host = '' database = args['path'] if 'path' in args else args['database'] - if provider not in ['mongodb','couchdb','bigquery','console','etl','file'] : + if provider not in ['mongodb','couchdb','bigquery','console','etl','file','rabbitmq'] : uri = ''.join([provider,"://",account,host,'/',database]) e = sqlalchemy.create_engine (uri,future=True) diff --git a/transport/common.py b/transport/common.py index 377d9a6..a41e46b 100644 --- a/transport/common.py +++ b/transport/common.py @@ -98,15 +98,15 @@ class Console(Writer): self.debug = self.write self.log = self.write pass - def write (self,info,**_args): + def write (self,**_args): if self.lock : Console.lock.acquire() try: - if type(info) == list: - for row in info : + if type(_args) == list: + for row in _args : print (row) else: - print (info) + print (_args) except Exception as e : print (e) finally: diff --git a/transport/etl.py b/transport/etl.py index 55e8ef6..6783cc6 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -54,41 +54,46 @@ if len(sys.argv) > 1: i += 2 class Post(Process): - def __init__(self,**args): - super().__init__() - - if 'provider' not in args['target'] : - self.PROVIDER = args['target']['type'] - self.writer = transport.factory.instance(**args['target']) - else: - self.PROVIDER = args['target']['provider'] - args['target']['context'] = 'write' - self.store = args['target'] - # self.writer = transport.instance(**args['target']) - # - # If the table doesn't exists maybe create it ? - # - self.rows = args['rows'].fillna('') + def __init__(self,**args): + super().__init__() + + if 'provider' not in args['target'] : + self.PROVIDER = args['target']['type'] + self.writer = transport.factory.instance(**args['target']) + else: + self.PROVIDER = args['target']['provider'] + args['target']['context'] = 'write' + self.store = args['target'] + self.store['lock'] = True + # self.writer = transport.instance(**args['target']) + # + # If the table doesn't exists maybe create it ? 
+ # + self.rows = args['rows'].fillna('') + def log(self,**_args) : + if ETL.logger : + ETL.logger.info(**_args) - def run(self): - _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - ltypes = self.rows.dtypes.values - columns = self.rows.dtypes.index.tolist() - # if not self.writer.has() : + def run(self): + _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows + ltypes = self.rows.dtypes.values + columns = self.rows.dtypes.index.tolist() + # if not self.writer.has() : - - # self.writer.make(fields=columns) - # ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table}) - for name in columns : - if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : - value = 0 - else: - value = '' - _info[name] = _info[name].fillna(value) - writer = transport.factory.instance(**self.store) - writer.write(_info) - writer.close() + + # self.writer.make(fields=columns) + # ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table}) + self.log(module='write',action='make-table',input={"schema":columns}) + for name in columns : + if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : + value = 0 + else: + value = '' + _info[name] = _info[name].fillna(value) + writer = transport.factory.instance(**self.store) + writer.write(_info) + writer.close() class ETL (Process): @@ -115,8 +120,9 @@ class ETL (Process): self.jobs = [] # self.logger = transport.factory.instance(**_args['logger']) def log(self,**_args) : - _args['name'] = self.name - print (_args) + if ETL.logger : + ETL.logger.info(**_args) + def run(self): if self.cmd : idf = self.reader.read(**self.cmd) @@ -126,7 +132,7 @@ class ETL (Process): # idf = idf.replace({np.nan: None}, inplace = True) idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - # ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) + self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) # # writing the data to a designated data source @@ -134,7 +140,7 @@ class ETL (Process): try: - # ETL.logger.info(module='write',action='partitioning') + self.log(module='write',action='partitioning',jobs=self.JOB_COUNT) rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) # @@ -148,10 +154,10 @@ class ETL (Process): proc = Post(target = self._oargs,rows = segment,name=str(i)) self.jobs.append(proc) proc.start() - - # ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0]) - # while poc : - # proc = [job for job in proc if job.is_alive()] + + self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0]) + # while self.jobs : + # jobs = [job for job in proc if job.is_alive()] # time.sleep(1) except Exception as e: print (e) @@ -166,9 +172,9 @@ def instance(**_args): """ logger = _args['logger'] if 'logger' in _args else None _info = _args['info'] - if logger : + if logger and type(logger) != str: ETL.logger = logger - else: + elif logger == 'console': ETL.logger = transport.factory.instance(provider='console',lock=True) if type(_info) in [list,dict] : _config = _info if type(_info) != dict else [_info] @@ -195,8 +201,6 @@ if __name__ == '__main__' : _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) - print (_config) - print () etl = ETL (**_config) if index is None: diff 
--git a/transport/rabbitmq.py b/transport/rabbitmq.py index 41d016a..68c5c5b 100644 --- a/transport/rabbitmq.py +++ b/transport/rabbitmq.py @@ -222,22 +222,21 @@ class QueueListener(MessageQueue): def __init__(self,**args): MessageQueue.__init__(self,**args) self.listen = self.read - # def init(self,qid): - # properties = pika.ConnectionParameters(host=self.host) - # self.connection = pika.BlockingConnection(properties) - # self.channel = self.connection.channel() - # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True ) - - # self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid) - - # self.channel.queue_bind(exchange=self.exchange,queue=self.info.method.queue,routing_key=qid) - #self.callback = callback + self.apply = args['apply'] if 'apply' in args else print def finalize(self,channel,ExceptionReason): pass - + def callback(self,channel,method,header,stream) : - raise Exception("....") + _info= {} + # if re.match("^\{|\[",stream) is not None: + + if stream.startswith(b"[") or stream.startswith(b"{"): + _info = json.loads(stream) + else: + + _info = stream + self.apply(_info) def read(self): self.init(self.queue) diff --git a/transport/sql.py b/transport/sql.py index a0893a9..52a676c 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -312,9 +312,11 @@ class BigQuery: :param sql sql query to be pulled, """ table = _args['table'] - - ref = self.client.dataset(self.dataset).table(table) - return self.client.get_table(ref).schema + try: + ref = self.client.dataset(self.dataset).table(table) + return self.client.get_table(ref).schema + except Exception as e: + return [] def has(self,**_args): found = False try: From f64b945245688f88de13410eb220ab7d6adcab95 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 11:27:36 -0500 Subject: [PATCH 093/271] new: authentication using file on disk, misc bug fixes --- setup.py | 2 +- transport/__init__.py | 16 ++++---- transport/common.py | 29 --------------- transport/mongo.py | 18 +++++++-- transport/rabbitmq.py | 85 +++++++++++++++++++++++++++++-------------- transport/session.py | 26 ++++++------- transport/sql.py | 11 ++++++ 7 files changed, 106 insertions(+), 81 deletions(-) diff --git a/setup.py b/setup.py index 67ecfc4..dd4b292 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.0", + "version":"1.5.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 6822138..86d7fce 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -57,25 +57,27 @@ import os class factory : TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { - "etl":{"class":{"read":etl.instance}}, + "etl":{"class":{"read":etl.instance,"write":etl.instance}}, "console":{"class":{"write":Console,"read":Console}}, "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, - "redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"}}, + 
"postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, - "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, - "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, + "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, - "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}, - "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener},"default":{"type":"application/json"}}} + "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener,"listener":queue.QueueListener},"default":{"type":"application/json"}}} # # creating synonyms PROVIDERS['mongodb'] = PROVIDERS['mongo'] PROVIDERS['couchdb'] = PROVIDERS['couch'] PROVIDERS['bq'] = PROVIDERS['bigquery'] PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] + PROVIDERS['rabbit'] = PROVIDERS['rabbitmq'] + PROVIDERS['rabbitmq-server'] = PROVIDERS['rabbitmq'] @staticmethod def instance(**_args): diff --git a/transport/common.py b/transport/common.py index a41e46b..e6578a6 100644 --- a/transport/common.py +++ b/transport/common.py @@ -113,32 +113,3 @@ class Console(Writer): if self.lock : Console.lock.release() -# class factory : -# @staticmethod -# def instance(**args): -# """ -# This class will create an instance of a transport when providing -# :type name of the type we are trying to create -# :args The arguments needed to create the instance -# """ -# source = args['type'] -# params = args['args'] -# anObject = None - -# if source in ['HttpRequestReader','HttpSessionWriter']: -# # -# # @TODO: Make sure objects are serializable, be smart about them !! 
-# # -# aClassName = ''.join([source,'(**params)']) - - -# else: - -# stream = json.dumps(params) -# aClassName = ''.join([source,'(**',stream,')']) -# try: -# anObject = eval( aClassName) -# #setattr(anObject,'name',source) -# except Exception,e: -# print ['Error ',e] -# return anObject \ No newline at end of file diff --git a/transport/mongo.py b/transport/mongo.py index fd2c5b8..8f593c3 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -41,11 +41,21 @@ class Mongo : self._lock = False if 'lock' not in args else args['lock'] - if 'user' in args and 'password' in args: + username = password = None + if 'username' in args and 'password' in args: + username = args['username'] + password=args['password'] + if 'auth_file' in args : + _info = json.loads((open(args['auth_file'])).read()) + username = _info['username'] + password = _info['password'] + authSource=(args['authSource'] if 'authSource' in args else self.dbname) + + if username and password : self.client = MongoClient(host, - username=args['username'] , - password=args['password'] , - authSource=(args['authSource'] if 'authSource' in args else self.dbname), + username=username, + password=password , + authSource=authSource, authMechanism='SCRAM-SHA-256') else: self.client = MongoClient(host,maxPoolSize=10000) diff --git a/transport/rabbitmq.py b/transport/rabbitmq.py index 68c5c5b..a56393b 100644 --- a/transport/rabbitmq.py +++ b/transport/rabbitmq.py @@ -16,7 +16,7 @@ if sys.version_info[0] > 2 : else: from common import Reader, Writer import json - +from multiprocessing import RLock class MessageQueue: """ This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) @@ -29,12 +29,23 @@ class MessageQueue: self.port= 5672 if 'port' not in params else params['port'] self.virtual_host = '/' if 'vhost' not in params else params['vhost'] self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange - self.queue = params['queue'] + self.queue = params['queue'] if 'queue' in params else 'demo' self.connection = None self.channel = None - self.name = self.__class__.__name__.lower() if 'name' not in params else 'wtf' + self.name = self.__class__.__name__.lower() if 'name' not in params else params['name'] + username = password = None + if 'username' in params : + username = params['username'] + password = params['password'] + if 'auth_file' in params : + _info = json.loads((open(params['auth_file'])).read()) + username=_info['username'] + password=_info['password'] + self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host + self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange + self.queue = _info['queue'] if 'queue' in _info else self.queue self.credentials= pika.PlainCredentials('guest','guest') if 'username' in params : @@ -44,7 +55,9 @@ class MessageQueue: ) def init(self,label=None): - properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host,credentials=self.credentials) + properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host, + client_properties={'connection_name':self.name}, + credentials=self.credentials) self.connection = pika.BlockingConnection(properties) self.channel = self.connection.channel() self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) @@ -93,23 +106,7 @@ class QueueWriter(MessageQueue,Writer): @param object object to be 
written (will be converted to JSON) @TODO: make this less chatty """ - # xchar = None - # if 'xchar' in params: - # xchar = params['xchar'] - # object = self.format(params['row'],xchar) - - # label = params['label'] - # self.init(label) - # _mode = 2 - # if isinstance(object,str): - # stream = object - # _type = 'text/plain' - # else: - # stream = json.dumps(object) - # if 'type' in params : - # _type = params['type'] - # else: - # _type = 'application/json' + stream = json.dumps(data) if isinstance(data,dict) else data self.channel.basic_publish( exchange=self.exchange, @@ -143,10 +140,11 @@ class QueueReader(MessageQueue,Reader): #self.queue = params['qid'] MessageQueue.__init__(self,**params); # self.init() - if 'durable' in params : - self.durable = True - else: - self.durable = False + self.durable = False if 'durable' not in params else params['durable'] + # if 'durable' in params : + # self.durable = True + # else: + # self.durable = False self.size = -1 self.data = {} # def init(self,qid): @@ -166,7 +164,8 @@ class QueueReader(MessageQueue,Reader): """ r = [] - if re.match("^\{|\[",stream) is not None: + # if re.match("^\{|\[",stream) is not None: + if stream.startswith(b'{') or stream.startswith(b'['): r = json.loads(stream) else: @@ -215,6 +214,7 @@ class QueueReader(MessageQueue,Reader): return self.data class QueueListener(MessageQueue): + lock = RLock() """ This class is designed to have an active listener (worker) against a specified Exchange/Queue It is initialized as would any other object and will require a callback function to address the objects returned. @@ -223,6 +223,7 @@ class QueueListener(MessageQueue): MessageQueue.__init__(self,**args) self.listen = self.read self.apply = args['apply'] if 'apply' in args else print + self.lock = False if 'lock' not in args else args['lock'] def finalize(self,channel,ExceptionReason): pass @@ -231,12 +232,30 @@ class QueueListener(MessageQueue): _info= {} # if re.match("^\{|\[",stream) is not None: + if stream.startswith(b"[") or stream.startswith(b"{"): _info = json.loads(stream) else: _info = stream - self.apply(_info) + # + # At this point we should invoke the apply function with a lock if need be + # @TODO: Establish a vocabulary + + if stream == b'QUIT' : + # channel.exit() + self.close() + if self.lock == True : + QueueListener.lock.acquire() + try: + # + # In case the user has not specified a function to apply the data against, it will simply be printed + # + self.apply(_info) + except Exception as e: + pass + if self.lock == True : + QueueListener.lock.release() def read(self): self.init(self.queue) @@ -246,3 +265,15 @@ class QueueListener(MessageQueue): +class Factory : + @staticmethod + def instance(**_args): + """ + :param count number of workers + :param apply function workers + """ + _apply = _args['apply'] + _count = _args['count'] + for i in np.arange(_count) : + _name = _args['name'] if 'name' in _args else 'worker_'+str(i) + transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file']) \ No newline at end of file diff --git a/transport/session.py b/transport/session.py index 5ca833a..915d2b5 100644 --- a/transport/session.py +++ b/transport/session.py @@ -5,11 +5,11 @@ from common import Reader, Writer import json class HttpRequestReader(Reader): - """ - This class is designed to read data from an Http request file handler provided to us by flask - The file will be heald in memory and processed accordingly - NOTE: This is inefficient and can crash a micro-instance 
(becareful) - """ + """ + This class is designed to read data from an Http request file handler provided to us by flask + The file will be heald in memory and processed accordingly + NOTE: This is inefficient and can crash a micro-instance (becareful) + """ def __init__(self,**params): self.file_length = 0 @@ -22,8 +22,8 @@ class HttpRequestReader(Reader): #print 'size of file ',self.file_length self.content = params['file'].readlines() self.file_length = len(self.content) - except Exception, e: - print "Error ... ",e + except Exception as e: + print ("Error ... ",e) pass def isready(self): @@ -37,13 +37,13 @@ class HttpRequestReader(Reader): yield row class HttpSessionWriter(Writer): - """ - This class is designed to write data to a session/cookie - """ + """ + This class is designed to write data to a session/cookie + """ def __init__(self,**params): - """ - @param key required session key - """ + """ + @param key required session key + """ self.session = params['queue'] self.session['sql'] = [] self.session['csv'] = [] diff --git a/transport/sql.py b/transport/sql.py index 52a676c..d2b0b36 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -64,6 +64,17 @@ class SQLRW : key = 'username' if 'username' in _args else 'user' _info['user'] = _args[key] _info['password'] = _args['password'] if 'password' in _args else '' + if 'auth_file' in _args : + _auth = json.loads( open(_args['auth_file']).read() ) + key = 'username' if 'username' in _auth else 'user' + _info['user'] = _auth[key] + _info['password'] = _auth['password'] if 'password' in _auth else '' + + _info['host'] = _auth['host'] if 'host' in _auth else _info['host'] + _info['port'] = _auth['port'] if 'port' in _auth else _info['port'] + if 'database' in _auth: + _info['dbname'] = _auth['database'] + self.table = _auth['table'] if 'table' in _auth else self.table # # We need to load the drivers here to see what we are dealing with ... 
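The auth_file option introduced in the patch above keeps credentials on disk rather than in code. A minimal sketch of how it might be used with the mongodb provider (the file path and the db/doc values here are illustrative assumptions, not part of the patch):

```
# /secure/mongo-auth.json -- illustrative path; the fields mirror what Mongo.__init__ reads
# {"username":"me", "password":"changeme"}
import transport

reader = transport.factory.instance(provider='mongodb', context='read',
                                    db='mydb', doc='logs',
                                    auth_file='/secure/mongo-auth.json')
_data = reader.read()
```

The same auth_file argument is honored by the rabbitmq and SQL wrappers patched here, each reading the keys shown in its respective diff.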
From db1496dd38ac02d4d2b20f96bd663485f74ca991 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 11:54:28 -0500 Subject: [PATCH 094/271] bug fixes --- transport/__init__.py | 4 ++-- transport/mongo.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 86d7fce..15f8f8d 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -151,7 +151,7 @@ def instance(**_args): try: host = '' - if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file','rabbitmq'] : + if provider not in ['bigquery','mongodb','mongo','couchdb','sqlite','console','etl','file','rabbitmq'] : # # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery username = args['username'] if 'username' in args else '' @@ -169,7 +169,7 @@ def instance(**_args): account = '' host = '' database = args['path'] if 'path' in args else args['database'] - if provider not in ['mongodb','couchdb','bigquery','console','etl','file','rabbitmq'] : + if provider not in ['mongodb','mongo','couchdb','bigquery','console','etl','file','rabbitmq'] : uri = ''.join([provider,"://",account,host,'/',database]) e = sqlalchemy.create_engine (uri,future=True) diff --git a/transport/mongo.py b/transport/mongo.py index 8f593c3..7cb89c3 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -38,7 +38,7 @@ class Mongo : host = ":".join([host,port]) #-- Formatting host information here self.uid = args['doc'] if 'doc' in args else None #-- document identifier self.dbname = args['dbname'] if 'dbname' in args else args['db'] - + authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] self._lock = False if 'lock' not in args else args['lock'] username = password = None @@ -49,6 +49,9 @@ class Mongo : _info = json.loads((open(args['auth_file'])).read()) username = _info['username'] password = _info['password'] + if 'mechanism' in _info: + authMechanism = _info['mechanism'] + authSource=(args['authSource'] if 'authSource' in args else self.dbname) if username and password : From 7c328e510665ac8bf04386178d06e1be6c41d9c6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 11:59:46 -0500 Subject: [PATCH 095/271] bug fix: mongodb mechanism parameter for authentication --- transport/mongo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/mongo.py b/transport/mongo.py index 7cb89c3..eb7b75d 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -59,7 +59,7 @@ class Mongo : username=username, password=password , authSource=authSource, - authMechanism='SCRAM-SHA-256') + authMechanism=authMechanism) else: self.client = MongoClient(host,maxPoolSize=10000) From 40bcfc40f95394c5accdddce4f7c6120f51c602d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 May 2022 13:14:29 -0500 Subject: [PATCH 096/271] bug fix: authentication --- transport/mongo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transport/mongo.py b/transport/mongo.py index eb7b75d..f6fa12b 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -39,6 +39,7 @@ class Mongo : self.uid = args['doc'] if 'doc' in args else None #-- document identifier self.dbname = args['dbname'] if 'dbname' in args else args['db'] authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] + authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] username = password = None @@ -51,8 +52,9 @@ 
class Mongo : password = _info['password'] if 'mechanism' in _info: authMechanism = _info['mechanism'] + if 'authSource' in _info: + authSource = _info['authSource'] - authSource=(args['authSource'] if 'authSource' in args else self.dbname) if username and password : self.client = MongoClient(host, From b2a224f4a7e6322329b2abc6d465704bc11e4b97 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 10:11:37 -0500 Subject: [PATCH 097/271] bug fix: sql lock for parallel processing --- setup.py | 2 +- transport/sql.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index dd4b292..ec4edf5 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.2", + "version":"1.5.3", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index d2b0b36..282938f 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -59,7 +59,7 @@ class SQLRW : # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] # # _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] - + self.lock = False if 'lock' not in _args else _args['lock'] if 'username' in _args or 'user' in _args: key = 'username' if 'username' in _args else 'user' _info['user'] = _args[key] @@ -272,7 +272,8 @@ class SQLWriter(SQLRW,Writer): if _info.shape[0] == 0 : return - SQLRW.lock.acquire() + if self.lock : + SQLRW.lock.acquire() if self._engine is not None: # pd.to_sql(_info,self._engine) @@ -299,7 +300,8 @@ class SQLWriter(SQLRW,Writer): if self._engine is None : self.conn.commit() - SQLRW.lock.release() + if self.lock : + SQLRW.lock.release() # cursor.close() pass def close(self): From 6b78a82e99996428c715567747bdceb48d2ee34c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 10 Jun 2022 10:27:44 -0500 Subject: [PATCH 098/271] bug fix --- transport/sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index 282938f..fe29e7b 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -258,7 +258,8 @@ class SQLWriter(SQLRW,Writer): # info = [info] if type(info) == dict else info.values.tolist() try: - table = self._tablename(self.table) + table = _args['table'] if 'table' in _args else self.table + table = self._tablename(table) _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) if type(info) == list : From 9805264f87efaf97c5876764b4be58466de44bf0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Jun 2022 20:05:03 -0500 Subject: [PATCH 099/271] bug fix: meta data --- setup.py | 2 +- transport/sql.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ec4edf5..897a57c 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.3", + "version":"1.5.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index fe29e7b..490f959 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ 
-96,7 +96,15 @@ class SQLRW : self.conn = _handler.connect(**_info) self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None def meta(self,**_args): - return [] + schema = [] + try: + if self._engine : + table = _args['table'] if 'table' in _args else self.table + _m = sqlalchemy.MetaData(bind=self._engine) + schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] + except Exception as e: + e + return schema def _tablename(self,name) : return self.schema +'.'+name if self.schema not in [None, ''] and '.' not in name else name From d7c9e410958f507c5d2be1aea4d55740785ed9f2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Jun 2022 21:14:46 -0500 Subject: [PATCH 100/271] bug fix: meta/sql.py --- transport/sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index 490f959..50ba1ed 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -101,9 +101,10 @@ class SQLRW : if self._engine : table = _args['table'] if 'table' in _args else self.table _m = sqlalchemy.MetaData(bind=self._engine) + _m.reflect() schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] except Exception as e: - e + pass return schema def _tablename(self,name) : From 3494da8c6e983ceda74741c78a66c10824535ffa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 12 Jun 2022 21:16:27 -0500 Subject: [PATCH 101/271] bug fix: meta/sql.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 897a57c..ad835f2 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.4", + "version":"1.5.5", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 8ca04e8c48a86956d709db854863cb55a2c19956 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 25 Jun 2022 13:59:49 -0500 Subject: [PATCH 102/271] bug fix: environment variable on default database --- setup.py | 2 +- transport/__init__.py | 4 +-- transport/common.py | 2 +- transport/mongo.py | 77 +++++++++++++++++++++++++++++-------------- 4 files changed, 57 insertions(+), 28 deletions(-) diff --git a/setup.py b/setup.py index ad835f2..e4b84a5 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.5", + "version":"1.5.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index 15f8f8d..2f6f5ac 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -61,8 +61,8 @@ class factory : "console":{"class":{"write":Console,"read":Console}}, "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - "postgresql":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "redshift":{"port":5432,"host":"localhost","database":os.environ['USER'],"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + 
"postgresql":{"port":5432,"host":"localhost","driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "redshift":{"port":5432,"host":"localhost","driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, diff --git a/transport/common.py b/transport/common.py index e6578a6..8234290 100644 --- a/transport/common.py +++ b/transport/common.py @@ -98,7 +98,7 @@ class Console(Writer): self.debug = self.write self.log = self.write pass - def write (self,**_args): + def write (self,logs,**_args): if self.lock : Console.lock.acquire() try: diff --git a/transport/mongo.py b/transport/mongo.py index f6fa12b..50f463f 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -33,37 +33,59 @@ class Mongo : :username username for authentication :password password for current user """ - port = str(args['port']) if 'port' in args else '27017' - host = args['host'] if 'host' in args else 'localhost' - host = ":".join([host,port]) #-- Formatting host information here - self.uid = args['doc'] if 'doc' in args else None #-- document identifier - self.dbname = args['dbname'] if 'dbname' in args else args['db'] - authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] - authSource=(args['authSource'] if 'authSource' in args else self.dbname) + # port = str(args['port']) if 'port' in args else '27017' + # host = args['host'] if 'host' in args else 'localhost' + # host = ":".join([host,port]) #-- Formatting host information here + # self.uid = args['doc'] if 'doc' in args else None #-- document identifier + # self.dbname = args['dbname'] if 'dbname' in args else args['db'] + self.authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] + # authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] username = password = None - if 'username' in args and 'password' in args: - username = args['username'] - password=args['password'] + # if 'username' in args and 'password' in args: + # username = args['username'] + # password=args['password'] if 'auth_file' in args : _info = json.loads((open(args['auth_file'])).read()) - username = _info['username'] - password = _info['password'] - if 'mechanism' in _info: - authMechanism = _info['mechanism'] - if 'authSource' in _info: - authSource = _info['authSource'] - - + # username = _info['username'] + # password = _info['password'] + # if 'mechanism' in _info: + # authMechanism = _info['mechanism'] + # if 'authSource' in _info: + # authSource = _info['authSource'] + # # + # # We are allowing the authentication file to set collection and databases too + # if 'db' in _info : + # self.dbname = _info['db'] + # if 'doc' in _info : + # self.uid = _info['doc'] + + else: + _info = {} + _args = dict(args,**_info) + for key in _args : + if key in ['username','password'] : + username = _args['username'] if key=='username' else username + password = _args['password'] if key == 'password' else password + continue + value = _args[key] + + self.setattr(key,value) + # + # Let us perform aliasing in order to remain backwards compatible + + self.dbname = 
self.db if hasattr(self,'db')else self.dbname + self.uid = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) if username and password : - self.client = MongoClient(host, + self.client = MongoClient(self.host, username=username, password=password , - authSource=authSource, - authMechanism=authMechanism) + authSource=self.authSource, + authMechanism=self.authMechanism) + else: - self.client = MongoClient(host,maxPoolSize=10000) + self.client = MongoClient(self.host,maxPoolSize=10000) self.db = self.client[self.dbname] @@ -71,6 +93,11 @@ class Mongo : p = self.dbname in self.client.list_database_names() q = self.uid in self.client[self.dbname].list_collection_names() return p and q + def setattr(self,key,value): + _allowed = ['host','port','db','doc','authSource'] + if key in _allowed : + setattr(self,key,value) + pass def close(self): self.client.close() @@ -110,7 +137,9 @@ class MongoReader(Mongo,Reader): else: collection = self.db[self.uid] _filter = args['filter'] if 'filter' in args else {} - return collection.find(_filter) + _df = pd.DataFrame(collection.find(_filter)) + columns = _df.columns.tolist()[1:] + return _df[columns] def view(self,**args): """ This function is designed to execute a view (map/reduce) operation @@ -162,7 +191,7 @@ class MongoWriter(Mongo,Writer): @param info new record in the collection to be added """ # document = self.db[self.uid].find() - collection = self.db[self.uid] + #collection = self.db[self.uid] # if type(info) == list : # self.db[self.uid].insert_many(info) # else: From b0c6bb6ca9a5ad5b4534182f8f4501a9dd5c1a1d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 25 Jun 2022 14:00:22 -0500 Subject: [PATCH 103/271] bug fix: environment variable on default database --- transport/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 2f6f5ac..5915b21 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -61,8 +61,8 @@ class factory : "console":{"class":{"write":Console,"read":Console}}, "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - "postgresql":{"port":5432,"host":"localhost","driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "redshift":{"port":5432,"host":"localhost","driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + "redshift":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, From ae1a4cec733ac93881899a24925dfd78461c0cc9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 26 Jun 2022 12:18:12 -0500 Subject: [PATCH 104/271] bug fix: removing print statements --- transport/disk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 16e57de..6963694 100644 --- 
a/transport/disk.py +++ b/transport/disk.py @@ -235,8 +235,6 @@ class SQLiteWriter(SQLite,DiskWriter) : cursor = self.conn.cursor() sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) for row in info : - print (row) - print (row.values()) stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] stream = json.dumps(stream).replace("[","").replace("]","") From 8a6beae956b9aed0645eda26784ba209be0407a7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 26 Jun 2022 15:48:29 -0500 Subject: [PATCH 105/271] bug fix: sql handler ... --- transport/disk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 6963694..dad2f33 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -189,7 +189,7 @@ class SQLiteWriter(SQLite,DiskWriter) : # self.conn.row_factory = sqlite3.Row # self.fields = args['fields'] if 'fields' in args else [] - if self.fields and not self.isready(): + if self.fields and not self.isready() and self.table: self.init(self.fields) SQLiteWriter.connection = self.conn def init(self,fields): @@ -210,7 +210,7 @@ class SQLiteWriter(SQLite,DiskWriter) : r = r.fetchall() cursor.close() - return r[0][0] + return r[0][0] != 0 except Exception as e: pass return 0 From 51512c39e1e4606d069678cd99ea5ab1d830abf7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 26 Jun 2022 15:48:54 -0500 Subject: [PATCH 106/271] version update 1.5.6 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e4b84a5..2983f70 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.6", + "version":"1.5.7", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 245b319e7b8b0b80234f2a2f3549a026a1c9468a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 3 Aug 2022 12:07:27 -0500 Subject: [PATCH 107/271] added partitioning and chunking to support healthy ETL jobs or write functions --- setup.py | 2 +- transport/common.py | 9 +++--- transport/etl.py | 52 +++++++++++++++++-------------- transport/sql.py | 76 ++++++++++++++++++++++++++++++--------------- 4 files changed, 86 insertions(+), 53 deletions(-) diff --git a/setup.py b/setup.py index 2983f70..aecb441 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.7", + "version":"1.5.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/common.py b/transport/common.py index 8234290..d9a3fab 100644 --- a/transport/common.py +++ b/transport/common.py @@ -98,15 +98,16 @@ class Console(Writer): self.debug = self.write self.log = self.write pass - def write (self,logs,**_args): + def write (self,logs=None,**_args): if self.lock : Console.lock.acquire() try: - if type(_args) == list: - for row in _args : + _params = _args if logs is None and _args else logs + if type(_params) == list: + for row in _params : print (row) else: - print (_args) + print (_params) except Exception as e : print (e) finally: diff --git a/transport/etl.py b/transport/etl.py index 6783cc6..83c6147 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -56,20 +56,22 @@ if len(sys.argv) > 1: class 
Post(Process): def __init__(self,**args): super().__init__() - + self.store = args['target'] if 'provider' not in args['target'] : + pass self.PROVIDER = args['target']['type'] - self.writer = transport.factory.instance(**args['target']) + # self.writer = transport.factory.instance(**args['target']) else: self.PROVIDER = args['target']['provider'] - args['target']['context'] = 'write' - self.store = args['target'] + self.store['context'] = 'write' + # self.store = args['target'] self.store['lock'] = True # self.writer = transport.instance(**args['target']) # # If the table doesn't exists maybe create it ? # - self.rows = args['rows'].fillna('') + self.rows = args['rows'] + # self.rows = args['rows'].fillna('') def log(self,**_args) : if ETL.logger : @@ -77,20 +79,7 @@ class Post(Process): def run(self): _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - ltypes = self.rows.dtypes.values - columns = self.rows.dtypes.index.tolist() - # if not self.writer.has() : - - - # self.writer.make(fields=columns) - # ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table}) - self.log(module='write',action='make-table',input={"schema":columns}) - for name in columns : - if _info[name].dtype in ['int32','int64','int','float','float32','float64'] : - value = 0 - else: - value = '' - _info[name] = _info[name].fillna(value) + writer = transport.factory.instance(**self.store) writer.write(_info) writer.close() @@ -149,9 +138,11 @@ class ETL (Process): # _id = ' '.join([str(i),' table ',self.name]) indexes = rows[i] segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') + _name = "partition-"+str(i) if segment.shape[0] == 0 : continue - proc = Post(target = self._oargs,rows = segment,name=str(i)) + + proc = Post(target = self._oargs,rows = segment,name=_name) self.jobs.append(proc) proc.start() @@ -167,17 +158,31 @@ class ETL (Process): return len(self.jobs) == 0 def instance(**_args): """ + :path ,index, id :param _info list of objects with {source,target}` :param logger """ logger = _args['logger'] if 'logger' in _args else None - _info = _args['info'] + if 'path' in _args : + _info = json.loads((open(_args['path'])).read()) + + + if 'index' in _args : + _index = int(_args['index']) + _info = _info[_index] + + elif 'id' in _args : + _info = [_item for _item in _info if '_id' in _item and _item['id'] == _args['id']] + _info = _info[0] if _info else _info + else: + _info = _args['info'] + if logger and type(logger) != str: ETL.logger = logger elif logger == 'console': - ETL.logger = transport.factory.instance(provider='console',lock=True) + ETL.logger = transport.factory.instance(provider='console',context='write',lock=True) if type(_info) in [list,dict] : - _config = _info if type(_info) != dict else [_info] + _info = _info if type(_info) != dict else [_info] # # The assumption here is that the objects within the list are {source,target} jobs = [] @@ -185,6 +190,7 @@ def instance(**_args): _item['jobs'] = 5 if 'procs' not in _args else int(_args['procs']) _job = ETL(**_item) + _job.start() jobs.append(_job) return jobs diff --git a/transport/sql.py b/transport/sql.py index 50ba1ed..08f3648 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -8,6 +8,10 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +@TODO: + - Migrate SQLite to SQL hierarchy + - Include Write in Chunks from pandas """ import psycopg2 as pg import mysql.connector as my @@ -31,6 +35,7 @@ import os class SQLRW : lock = RLock() + MAX_CHUNK = 2000000 DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} REFERENCE = { "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, @@ -47,6 +52,7 @@ class SQLRW : self.table = _args['table'] if 'table' in _args else None self.fields = _args['fields'] if 'fields' in _args else [] self.schema = _args['schema'] if 'schema' in _args else '' + self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) self._provider = _args['provider'] if 'provider' in _args else None # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] @@ -103,6 +109,13 @@ class SQLRW : _m = sqlalchemy.MetaData(bind=self._engine) _m.reflect() schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] + # + # Some house keeping work + _m = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} + for _item in schema : + if _item['type'] in _m : + _item['type'] = _m[_item['type']] + except Exception as e: pass return schema @@ -258,14 +271,7 @@ class SQLWriter(SQLRW,Writer): # _fields = info.keys() if type(info) == dict else info[0].keys() _fields = list (_fields) self.init(_fields) - # - # @TODO: Use pandas/odbc ? Not sure b/c it requires sqlalchemy - # - # if type(info) != list : - # # - # # We are assuming 2 cases i.e dict or pd.DataFrame - # info = [info] if type(info) == dict else info.values.tolist() - + try: table = _args['table'] if 'table' in _args else self.table table = self._tablename(table) @@ -284,22 +290,36 @@ class SQLWriter(SQLRW,Writer): return if self.lock : SQLRW.lock.acquire() - - if self._engine is not None: - # pd.to_sql(_info,self._engine) - if self.schema in ['',None] : - rows = _info.to_sql(table,self._engine,if_exists='append',index=False) - else: - rows = _info.to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) + # + # we will adjust the chunks here in case we are not always sure of the + if self._chunks == 1 and _info.shape[0] > SQLRW.MAX_CHUNK : + self._chunks = 10 + _indexes = np.array_split(np.arange(_info.shape[0]),self._chunks) + for i in _indexes : + # + # In case we have an invalid chunk ... 
+ if _info.iloc[i].shape[0] == 0 : + continue + # + # We are enabling writing by chunks/batches because some persistent layers have quotas or limitations on volume of data - else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) - cursor = self.conn.cursor() - cursor.executemany(_sql,_info.values.tolist()) - cursor.close() + if self._engine is not None: + # pd.to_sql(_info,self._engine) + if self.schema in ['',None] : + rows = _info.iloc[i].to_sql(table,self._engine,if_exists='append',index=False) + else: + # + # Writing with schema information ... + rows = _info.iloc[i].to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) + + else: + _fields = ",".join(self.fields) + _sql = _sql.replace(":fields",_fields) + values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) + _sql = _sql.replace(":values",values) + cursor = self.conn.cursor() + cursor.executemany(_sql,_info.iloc[i].values.tolist()) + cursor.close() # cursor.commit() # self.conn.commit() @@ -382,6 +402,7 @@ class BQWriter(BigQuery,Writer): self.parallel = False if 'lock' not in _args else _args['lock'] self.table = _args['table'] if 'table' in _args else None self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} + self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) def write(self,_info,**_args) : try: @@ -409,8 +430,13 @@ class BQWriter(BigQuery,Writer): self.mode['table_schema'] = _args['schema'] # _mode = copy.deepcopy(self.mode) _mode = self.mode - _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - + # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + # + # Let us adjust the chunking here + self._chunkks = 10 if _df.shape[0] > SQLRW.MAX_CHUNK and self._chunks == 1 else self._chunks + _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) + for i in _indexes : + _df.iloc[i].to_gbq(**self.mode) pass # # Aliasing the big query classes allowing it to be backward compatible From e5f3b1933657d3616cd256a95f6e774d1e005f05 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Aug 2022 09:17:30 -0500 Subject: [PATCH 108/271] bug fix: added chunks and enhacement with global variables (constants) --- setup.py | 4 ++-- transport/__init__.py | 44 ++++++++++++++++++++++++++++++++++++++----- transport/common.py | 2 ++ transport/disk.py | 4 +++- transport/mongo.py | 2 +- transport/sql.py | 4 +++- 6 files changed, 50 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index aecb441..2e6991e 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.5.8", + "version":"1.6.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = 
['pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['nujson','pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index 5915b21..b4e80fb 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -52,8 +52,38 @@ from google.cloud import bigquery as bq import nzpy as nz #--- netezza drivers import os +class providers : + POSTGRESQL = 'postgresql' + MONGODB = 'mongodb' + BIGQUERY ='bigquery' + FILE = 'file' + ETL = 'etl' + SQLITE = 'sqlite' + REDSHIFT = 'redshift' + NETEZZA = 'netezza' + MYSQL = 'mysql' + RABBITMQ = 'rabbitmq' + MARIADB = 'mariadb' + COUCHDB = 'couch' + CONSOLE = 'console' + ETL = 'etl' + # + # synonyms of the above + BQ = BIGQUERY + MONGO = MONGODB + PG = POSTGRESQL + PSQL = POSTGRESQL - +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + else: + return super(IEncoder,self).default(object) class factory : TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { @@ -149,9 +179,10 @@ def instance(**_args): # # Let us try to establish an sqlalchemy wrapper try: - + account = '' host = '' - if provider not in ['bigquery','mongodb','mongo','couchdb','sqlite','console','etl','file','rabbitmq'] : + if provider not in [providers.BIGQUERY,providers.MONGODB, providers.COUCHDB, providers.SQLITE, providers.CONSOLE,providers.ETL, providers.FILE, providers.RABBITMQ] : + # if provider not in ['bigquery','mongodb','mongo','couchdb','sqlite','console','etl','file','rabbitmq'] : # # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery username = args['username'] if 'username' in args else '' @@ -165,11 +196,13 @@ def instance(**_args): host = host+":"+str(args['port']) database = args['database'] - elif provider == 'sqlite': + elif provider in [providers.SQLITE,providers.FILE]: account = '' host = '' database = args['path'] if 'path' in args else args['database'] - if provider not in ['mongodb','mongo','couchdb','bigquery','console','etl','file','rabbitmq'] : + + if provider not in [providers.MONGODB, providers.COUCHDB, providers.BIGQUERY, providers.CONSOLE, providers.ETL,providers.FILE,providers.RABBITMQ] : + # if provider not in ['mongodb','mongo','couchdb','bigquery','console','etl','file','rabbitmq'] : uri = ''.join([provider,"://",account,host,'/',database]) e = sqlalchemy.create_engine (uri,future=True) @@ -178,6 +211,7 @@ def instance(**_args): # # @TODO: Include handling of bigquery with SQLAlchemy except Exception as e: + print (_args) print (e) return pointer(**args) diff --git a/transport/common.py b/transport/common.py index d9a3fab..2ed7cd2 100644 --- a/transport/common.py +++ b/transport/common.py @@ -24,6 +24,8 @@ import importlib from multiprocessing import RLock # import couch # import mongo + + class IO: def 
init(self,**args): """ diff --git a/transport/disk.py b/transport/disk.py index dad2f33..0eb42fe 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -1,10 +1,12 @@ import os import sys + + if sys.version_info[0] > 2 : from transport.common import Reader, Writer #, factory else: from common import Reader,Writer -import json +import nujson as json # from threading import Lock import sqlite3 import pandas as pd diff --git a/transport/mongo.py b/transport/mongo.py index 50f463f..e212047 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -7,7 +7,7 @@ This file is a wrapper around mongodb for reading/writing content against a mong from pymongo import MongoClient from bson.objectid import ObjectId from bson.binary import Binary -import json +import nujson as json from datetime import datetime import pandas as pd diff --git a/transport/sql.py b/transport/sql.py index 08f3648..2f4d96f 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -357,7 +357,9 @@ class BigQuery: table = _args['table'] try: ref = self.client.dataset(self.dataset).table(table) - return self.client.get_table(ref).schema + _schema = self.client.get_table(ref).schema + return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] + except Exception as e: return [] def has(self,**_args): From 3a3946c7d88418146b04cb180e7f22dd0341b20e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 19 Aug 2022 15:32:35 -0500 Subject: [PATCH 109/271] bug fix --- transport/mongo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/mongo.py b/transport/mongo.py index e212047..bf20482 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -108,6 +108,7 @@ class MongoReader(Mongo,Reader): def __init__(self,**args): Mongo.__init__(self,**args) def read(self,**args): + if 'mongo' in args : # # @TODO: From 8bb495842ad6150d54bdafe23daed4af89b1ab6f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 19 Aug 2022 16:00:37 -0500 Subject: [PATCH 110/271] documentation ... --- README.md | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0add2c7..c641952 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ Mostly data scientists that don't really care about the underlying database and 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included -3. Useful for data migrations or ETL +3. Mining data from various sources +4. Useful for data migrations or ETL # Usage @@ -35,7 +36,8 @@ Within the virtual environment perform the following : pip install git+https://dev.the-phi.com/git/steve/data-transport.git -Once installed **data-transport** can be used as a library in code or a command line interface (CLI) +Once installed **data-transport** can be used as a library in code or a command line interface (CLI), as a CLI it is used for ETL and requires a configuration file. + ## Data Transport as a Library (in code) --- @@ -112,12 +114,71 @@ df = reader.read(mongo=_command) print (df.head()) reader.close() ``` -**Writing to Mongodb** +**Read/Writing to Mongodb** --- + +Scenario 1: Mongodb with security in place + +1. Define an authentication file on disk + + The semantics of the attributes are provided by mongodb, please visit [mongodb documentation](https://mongodb.org/docs). In this example the file is located on _/transport/mongo.json_ +
+
+configuration file
+
+```
+{
+    "username":"me","password":"changeme",
+    "mechanism":"SCRAM-SHA-1",
+    "authSource":"admin"
+}
+```
+Connecting to Mongodb
+
+```
+import transport
+PIPELINE = ... #-- do this yourself
+MONGO_KEY = '/transport/mongo.json'
+mreader = transport.factory.instance(provider=transport.providers.MONGODB,auth_file=MONGO_KEY,context='read',db='mydb',doc='logs')
+_aggregateDF = mreader.read(mongo=PIPELINE) #--results of an aggregate pipeline
+_collectionDF= mreader.read()
+
+
+```
+
+In order to enable write, change the **context** attribute to **'write'**.
+
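For instance, a writer against the same collection can reuse the authentication file (a sketch; MONGO_KEY is the path defined above and the data-frame is whatever needs to be persisted):

```
mwriter = transport.factory.instance(provider=transport.providers.MONGODB,auth_file=MONGO_KEY,context='write',db='mydb',doc='logs')
mwriter.write(_collectionDF)   # a pandas data-frame
mwriter.close()
```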
+- The configuration file is in JSON format
+- The commands passed to mongodb are the same as you would if you applied runCommand in mongodb
+- The output is a pandas data-frame
+- By default the transport reads, to enable write operations use **context='write'**
+
+|parameters|description |
+| --- | --- |
+|db| Name of the database|
+|port| Port number to connect to|
+|doc| Name of the collection of documents|
+|username|Username |
+|password|password|
+|authSource|user database that has authentication info|
+|mechanism|Mechanism used for authentication|
+
+**NOTE**
+
+Arguments like **db** or **doc** can be placed in the authentication file
+
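For example, an authentication file that also carries the database and collection might look like this (a sketch; all values are placeholders):

```
{
    "username":"me", "password":"changeme",
    "db":"mydb", "doc":"logs"
}
```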
+ +**Limitations** + +Reads and writes aren't encapsulated in the same object, this is to allow the calling code to deliberately perform actions and hopefully minimize accidents associated with data wrangling. + + ``` import transport improt pandas as pd -writer = factory.instance(provider='mongodb',context='write',host='localhost',port='27018',db='example',doc='logs') +writer = factory.instance(provider=transport.providers.MONGODB,context='write',host='localhost',port='27018',db='example',doc='logs') df = pd.DataFrame({"names":["steve","nico"],"age":[40,30]}) writer.write(df) From 883a6ef22f8fa5b08b84fb4f0511eb982c29b96f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Sep 2022 10:01:34 -0500 Subject: [PATCH 111/271] bug fix & new feature --- transport/__init__.py | 1 + transport/common.py | 19 ++++++++++++++++++- transport/mongo.py | 32 ++++++++++---------------------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index b4e80fb..baa960e 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -59,6 +59,7 @@ class providers : FILE = 'file' ETL = 'etl' SQLITE = 'sqlite' + SQLITE3= 'sqlite' REDSHIFT = 'redshift' NETEZZA = 'netezza' MYSQL = 'mysql' diff --git a/transport/common.py b/transport/common.py index 2ed7cd2..73f42ac 100644 --- a/transport/common.py +++ b/transport/common.py @@ -115,4 +115,21 @@ class Console(Writer): finally: if self.lock : Console.lock.release() - +""" +@NOTE : Experimental !! +""" +class Proxy : + """ + This class will forward a call to a function that is provided by the user code + """ + def __init__(self,**_args): + self.callback = _args['callback'] + def read(self,**_args) : + try: + return self.callback(**_args) + except Exception as e: + return self.callback() + + pass + def write(self,data,**_args): + self.callback(data,**_args) diff --git a/transport/mongo.py b/transport/mongo.py index bf20482..eb5f2da 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -10,7 +10,7 @@ from bson.binary import Binary import nujson as json from datetime import datetime import pandas as pd - +import numpy as np import gridfs # from transport import Reader,Writer import sys @@ -33,33 +33,15 @@ class Mongo : :username username for authentication :password password for current user """ - # port = str(args['port']) if 'port' in args else '27017' - # host = args['host'] if 'host' in args else 'localhost' - # host = ":".join([host,port]) #-- Formatting host information here - # self.uid = args['doc'] if 'doc' in args else None #-- document identifier - # self.dbname = args['dbname'] if 'dbname' in args else args['db'] + self.authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] # authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] username = password = None - # if 'username' in args and 'password' in args: - # username = args['username'] - # password=args['password'] if 'auth_file' in args : _info = json.loads((open(args['auth_file'])).read()) - # username = _info['username'] - # password = _info['password'] - # if 'mechanism' in _info: - # authMechanism = _info['mechanism'] - # if 'authSource' in _info: - # authSource = _info['authSource'] - # # - # # We are allowing the authentication file to set collection and databases too - # if 'db' in _info : - # self.dbname = _info['db'] - # if 'doc' in _info : - # self.uid = _info['doc'] + else: _info = {} @@ -100,7 +82,8 @@ class Mongo : pass def 
close(self): self.client.close() - + def meta(self,**_args): + return [] class MongoReader(Mongo,Reader): """ This class will read from a mongodb data store and return the content of a document (not a collection) @@ -113,6 +96,11 @@ class MongoReader(Mongo,Reader): # # @TODO: cmd = args['mongo'] + if "aggregate" in cmd : + if "allowDiskUse" not in cmd : + cmd["allowDiskUse"] = True + if "cursor" not in cmd : + cmd["cursor"] = {} r = [] out = self.db.command(cmd) #@TODO: consider using a yield (generator) works wonders From 2c675d56567c6f023f12151c5455d050d213ab10 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 13 Nov 2022 14:58:15 -0600 Subject: [PATCH 112/271] bug fix, mongodb authentication --- mongo.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 mongo.json diff --git a/mongo.json b/mongo.json new file mode 100644 index 0000000..2479c47 --- /dev/null +++ b/mongo.json @@ -0,0 +1,4 @@ +{ + "authSource":"admin", + "username":"steve","password":"Innovat10n","db":"IOV01_RESEARCH","collection":"logs" +} From b8fb538ec7f243aef1c3e764b6c34367b574f556 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 13 Nov 2022 15:45:21 -0600 Subject: [PATCH 113/271] bug fix mongodb,bigquery --- setup.py | 2 +- transport/mongo.py | 10 +++++----- transport/sql.py | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 2e6991e..4c79ec7 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.0", + "version":"1.6.1", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index eb5f2da..e3dbaff 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -34,7 +34,7 @@ class Mongo : :password password for current user """ - self.authMechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] + self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] # authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] @@ -64,7 +64,7 @@ class Mongo : username=username, password=password , authSource=self.authSource, - authMechanism=self.authMechanism) + authMechanism=self.mechanism) else: self.client = MongoClient(self.host,maxPoolSize=10000) @@ -76,7 +76,7 @@ class Mongo : q = self.uid in self.client[self.dbname].list_collection_names() return p and q def setattr(self,key,value): - _allowed = ['host','port','db','doc','authSource'] + _allowed = ['host','port','db','doc','authSource','mechanism'] if key in _allowed : setattr(self,key,value) pass @@ -92,10 +92,10 @@ class MongoReader(Mongo,Reader): Mongo.__init__(self,**args) def read(self,**args): - if 'mongo' in args : + if 'mongo' in args or 'cmd' : # # @TODO: - cmd = args['mongo'] + cmd = args['mongo'] if 'mongo' in args else args['cmd'] if "aggregate" in cmd : if "allowDiskUse" not in cmd : cmd["allowDiskUse"] = True diff --git a/transport/sql.py b/transport/sql.py index 2f4d96f..9d278a3 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -430,6 +430,9 @@ class BQWriter(BigQuery,Writer): self.mode['destination_table'] = _args['table'].strip() if 'schema' in _args : self.mode['table_schema'] = _args['schema'] + # + # Let us insure that the types are somewhat compatible ... 
+ # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} # _mode = copy.deepcopy(self.mode) _mode = self.mode # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) From 1c254eb133c9463c604cf0394b59b0119a57a6e1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 9 Dec 2022 16:19:39 -0600 Subject: [PATCH 114/271] bug fixes, enhancements mongodb --- README.md | 6 ++++-- setup.py | 2 +- transport/sql.py | 4 ++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c641952..87e5b1e 100644 --- a/README.md +++ b/README.md @@ -42,11 +42,13 @@ Once installed **data-transport** can be used as a library in code or a command ## Data Transport as a Library (in code) --- -The data-transport can be used within code as a library +The data-transport can be used within code as a library, and offers the following capabilities: + * Read/Write against [mongodb](https://github.com/lnyemba/data-transport/wiki/mongodb) * Read/Write against tranditional [RDBMS](https://github.com/lnyemba/data-transport/wiki/rdbms) * Read/Write against [bigquery](https://github.com/lnyemba/data-transport/wiki/bigquery) * ETL CLI/Code [ETL](https://github.com/lnyemba/data-transport/wiki/etl) +* Support for pre/post conditions i.e it is possible to specify queries to run before or after a read or write The read/write functions make data-transport a great candidate for **data-science**; **data-engineering** or all things pertaining to data. It enables operations across multiple data-stores(relational or not) @@ -60,7 +62,7 @@ It is possible to perform ETL within custom code as follows : import transport import time - _info = [{source:{'provider':'sqlite','path':'/home/me/foo.csv','table':'me'},target:{provider:'bigquery',private_key='/home/me/key.json','table':'me','dataset':'mydataset'}}, ...] + _info = [{source:{'provider':'sqlite','path':'/home/me/foo.csv','table':'me',"pipeline":{"pre":[],"post":[]}},target:{provider:'bigquery',private_key='/home/me/key.json','table':'me','dataset':'mydataset'}}, ...] 
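    #
    # a sketch, not part of this repository: given the "pre/post conditions" bullet above,
    # the "pipeline" entry is presumably meant to carry statements to run before/after the copy, e.g.
    # "pipeline":{"pre":["DELETE FROM me"],"post":["SELECT COUNT(*) FROM me"]}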
procs = transport.factory.instance(provider='etl',info=_info) # # diff --git a/setup.py b/setup.py index 4c79ec7..b9dbb37 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.1", + "version":"1.6.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index 9d278a3..f6e196c 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -192,6 +192,10 @@ class SQLReader(SQLRW,Reader) : _sql = _sql.replace(":fields",_fields) if 'limit' in _args : _sql = _sql + " LIMIT "+str(_args['limit']) + # + # @TODO: + # It is here that we should inspect to see if there are any pre/post conditions + # return self.apply(_sql) def close(self) : try: From f59dcd544f26c552b013afcb6dfa3cd4d08d2c24 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 10 Dec 2022 11:51:08 -0600 Subject: [PATCH 115/271] oopsie --- mongo.json | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 mongo.json diff --git a/mongo.json b/mongo.json deleted file mode 100644 index 2479c47..0000000 --- a/mongo.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "authSource":"admin", - "username":"steve","password":"Innovat10n","db":"IOV01_RESEARCH","collection":"logs" -} From 98c9348f9c1db16ab58645f4ea01b52d97824269 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 14 Dec 2022 16:48:40 -0600 Subject: [PATCH 116/271] bug fix: mongodb read --- README.md | 2 +- setup.py | 2 +- transport/mongo.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 87e5b1e..eaa176d 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Mostly data scientists that don't really care about the underlying database and Within the virtual environment perform the following : - pip install git+https://dev.the-phi.com/git/steve/data-transport.git + pip install git+https://github.com/lnyemba/data-transport.git Once installed **data-transport** can be used as a library in code or a command line interface (CLI), as a CLI it is used for ETL and requires a configuration file. 
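A minimal sketch of driving such a run from code, assuming the configuration file is a JSON list of the same `{source,target}` mappings shown in the library example (the file name and the `procs` count below are illustrative assumptions; only `transport.factory.instance(provider='etl',...)` and the job polling loop come from this code base):

```
import json
import time
import transport

# hypothetical configuration file: a JSON list of {source,target[,id]} entries
_info = json.loads(open('etl-config.json').read())

# spawn the ETL jobs and wait for them to finish
jobs = transport.factory.instance(provider='etl', info=_info, procs=2)
while jobs :
    jobs = [_job for _job in jobs if _job.is_alive()]
    time.sleep(1)
```

Each entry may also carry an `id`, which the command line interface uses to select a single job to run.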
diff --git a/setup.py b/setup.py index b9dbb37..adbea05 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.2", + "version":"1.6.3", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/mongo.py b/transport/mongo.py index e3dbaff..dc72598 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -92,7 +92,7 @@ class MongoReader(Mongo,Reader): Mongo.__init__(self,**args) def read(self,**args): - if 'mongo' in args or 'cmd' : + if 'mongo' in args or 'cmd' in args: # # @TODO: cmd = args['mongo'] if 'mongo' in args else args['cmd'] From c2317dba9a5b83156af8dde4934a0258477de97a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Jan 2023 09:08:24 -0600 Subject: [PATCH 117/271] bug fix: deprecated json library --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index adbea05..18c0d82 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.3", + "version":"1.6.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['nujson','pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : From b92055f018c486c1e6cbf4a7e61b3f75a782976d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Jan 2023 09:12:14 -0600 Subject: [PATCH 118/271] bug fix: disk - nujson, refactor --- transport/disk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/disk.py b/transport/disk.py index 0eb42fe..34875ad 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -6,7 +6,7 @@ if sys.version_info[0] > 2 : from transport.common import Reader, Writer #, factory else: from common import Reader,Writer -import nujson as json +# import nujson as json # from threading import Lock import sqlite3 import pandas as pd From dcd6a83a35746e725ab5989ceb97d566db4c52af Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 4 Jan 2023 09:12:55 -0600 Subject: [PATCH 119/271] bug fix: disk - nujson, refactor --- transport/disk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/disk.py b/transport/disk.py index 34875ad..8514e3f 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -7,6 +7,7 @@ if sys.version_info[0] > 2 : else: from common import Reader,Writer # import nujson as json +import json # from threading import Lock import sqlite3 import pandas as pd From 5f9416d2f6f897c6b7e57e61b2cdf2ea6c157459 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 18 Jan 2023 21:13:38 -0600 Subject: [PATCH 120/271] bug fix: removing unused library 
--- transport/mongo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/mongo.py b/transport/mongo.py index dc72598..0ceb1e0 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -7,7 +7,7 @@ This file is a wrapper around mongodb for reading/writing content against a mong from pymongo import MongoClient from bson.objectid import ObjectId from bson.binary import Binary -import nujson as json +# import nujson as json from datetime import datetime import pandas as pd import numpy as np From 64f0b3ca80101ac753c6cef29e0ccf3ed4b1a9c6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 18 Jan 2023 21:14:09 -0600 Subject: [PATCH 121/271] version increment --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18c0d82..9d9fdcf 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.4", + "version":"1.6.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 695c10e797fbeeeb64fb05a815eac49fe2bd73e1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 25 May 2023 09:39:51 -0500 Subject: [PATCH 122/271] bug fix, added ferretdb --- setup.py | 2 +- transport/__init__.py | 3 +++ transport/common.py | 3 +++ transport/mongo.py | 15 +++++++++++++-- transport/sql.py | 2 ++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 9d9fdcf..483ea87 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.6", + "version":"1.6.8", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/__init__.py b/transport/__init__.py index baa960e..f43a184 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -55,6 +55,7 @@ import os class providers : POSTGRESQL = 'postgresql' MONGODB = 'mongodb' + BIGQUERY ='bigquery' FILE = 'file' ETL = 'etl' @@ -72,8 +73,10 @@ class providers : # synonyms of the above BQ = BIGQUERY MONGO = MONGODB + FERRETDB= MONGODB PG = POSTGRESQL PSQL = POSTGRESQL + PGSQL = POSTGRESQL class IEncoder (json.JSONEncoder): def default (self,object): diff --git a/transport/common.py b/transport/common.py index 73f42ac..39df6a3 100644 --- a/transport/common.py +++ b/transport/common.py @@ -22,6 +22,7 @@ import numpy as np import json import importlib from multiprocessing import RLock +import queue # import couch # import mongo @@ -115,6 +116,8 @@ class Console(Writer): finally: if self.lock : Console.lock.release() + + """ @NOTE : Experimental !! 
""" diff --git a/transport/mongo.py b/transport/mongo.py index 0ceb1e0..8ab4418 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -124,7 +124,15 @@ class MongoReader(Mongo,Reader): return pd.DataFrame(r) else: - collection = self.db[self.uid] + if 'table' in args or 'collection' in args : + if 'table' in args: + _uid = args['table'] + elif 'collection' in args : + _uid = args['collection'] + else: + _uid = self.uid + + collection = self.db[_uid] _filter = args['filter'] if 'filter' in args else {} _df = pd.DataFrame(collection.find(_filter)) columns = _df.columns.tolist()[1:] @@ -185,7 +193,10 @@ class MongoWriter(Mongo,Writer): # self.db[self.uid].insert_many(info) # else: try: - _uid = self.uid if 'doc' not in _args else _args['doc'] + if 'table' in _args or 'collection' in _args : + _uid = _args['table'] if 'table' in _args else _args['collection'] + else: + _uid = self.uid if 'doc' not in _args else _args['doc'] if self._lock : Mongo.lock.acquire() if type(info) == list or type(info) == pd.DataFrame : diff --git a/transport/sql.py b/transport/sql.py index f6e196c..05d254d 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -278,7 +278,9 @@ class SQLWriter(SQLRW,Writer): try: table = _args['table'] if 'table' in _args else self.table + self.schema = _args['schema'] if 'schema' in _args else self.schema table = self._tablename(table) + _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) if type(info) == list : From ed66370fdff37b75cc590c1c817b9f322b4d718a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 May 2023 15:01:51 -0500 Subject: [PATCH 123/271] bug fix: ETL jobs and streamline --- transport/etl.py | 87 ++++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/transport/etl.py b/transport/etl.py index 83c6147..83018e6 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -91,20 +91,24 @@ class ETL (Process): super().__init__() self.name = _args['id'] if 'id' in _args else 'UNREGISTERED' - if 'provider' not in _args['source'] : - #@deprecate - self.reader = transport.factory.instance(**_args['source']) - else: - # - # This is the new interface - _args['source']['context'] = 'read' + # if 'provider' not in _args['source'] : + # #@deprecate + # self.reader = transport.factory.instance(**_args['source']) + # else: + # # + # # This is the new interface + # _args['source']['context'] = 'read' - self.reader = transport.instance(**_args['source']) + # self.reader = transport.instance(**_args['source']) + # # do we have an sql query provided or not .... 
# self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None - self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None - self._oargs = _args['target'] #transport.factory.instance(**_args['target']) + # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None + # self._oargs = _args['target'] #transport.factory.instance(**_args['target']) + self._source = dict(_args ['source'],**{'context':'write'}) + self._target = dict(_args['target'],**{'context':'read','lock':True}) + self.JOB_COUNT = _args['jobs'] self.jobs = [] # self.logger = transport.factory.instance(**_args['logger']) @@ -113,46 +117,57 @@ class ETL (Process): ETL.logger.info(**_args) def run(self): - if self.cmd : - idf = self.reader.read(**self.cmd) - else: - idf = self.reader.read() - idf = pd.DataFrame(idf) - # idf = idf.replace({np.nan: None}, inplace = True) + # if self.cmd : + # idf = self.reader.read(**self.cmd) + # else: + # idf = self.reader.read() + # idf = pd.DataFrame(idf) + # # idf = idf.replace({np.nan: None}, inplace = True) - idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) + # idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] + # self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) # # writing the data to a designated data source # try: - - - self.log(module='write',action='partitioning',jobs=self.JOB_COUNT) - rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) - + _log = {"name":self.name,"rows":{"input":0,"output":0}} + _reader = transport.factory.instance(**self._source) + if 'table' in self._source : + _df = _reader.read() + else: + _df = _reader.read(**self._source['cmd']) + _log['rows']['input'] = _df.shape[0] # - # @TODO: locks - for i in np.arange(self.JOB_COUNT) : - # _id = ' '.join([str(i),' table ',self.name]) - indexes = rows[i] - segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') - _name = "partition-"+str(i) - if segment.shape[0] == 0 : - continue + # Let's write the input data-frame to the target ... 
+ _writer = transport.factory.instance(**self._target) + _writer.write(_df) + _log['rows']['output'] = _df.shape[0] + + # self.log(module='write',action='partitioning',jobs=self.JOB_COUNT) + # rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) + + # # + # # @TODO: locks + # for i in np.arange(self.JOB_COUNT) : + # # _id = ' '.join([str(i),' table ',self.name]) + # indexes = rows[i] + # segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') + # _name = "partition-"+str(i) + # if segment.shape[0] == 0 : + # continue - proc = Post(target = self._oargs,rows = segment,name=_name) - self.jobs.append(proc) - proc.start() + # proc = Post(target = self._oargs,rows = segment,name=_name) + # self.jobs.append(proc) + # proc.start() - self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0]) + # self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0]) # while self.jobs : # jobs = [job for job in proc if job.is_alive()] # time.sleep(1) except Exception as e: print (e) - + self.log(**_log) def is_done(self): self.jobs = [proc for proc in self.jobs if proc.is_alive()] return len(self.jobs) == 0 From 78ad8f3ad83ac0b64fd46d52c6bc7713a0c720e7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 May 2023 15:19:10 -0500 Subject: [PATCH 124/271] bug fix: etl transport --- transport/etl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/etl.py b/transport/etl.py index 83018e6..d509005 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -106,8 +106,8 @@ class ETL (Process): # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None # self._oargs = _args['target'] #transport.factory.instance(**_args['target']) - self._source = dict(_args ['source'],**{'context':'write'}) - self._target = dict(_args['target'],**{'context':'read','lock':True}) + self._source = _args['source'] #dict(_args ['source'],**{'context':'write'}) + self._target = _args['target'] #dict(_args['target'],**{'context':'read','lock':True}) self.JOB_COUNT = _args['jobs'] self.jobs = [] From 66f43a98c1230013ac39a9cc9f2ea75421be7ee2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 30 May 2023 15:47:10 -0500 Subject: [PATCH 125/271] bug fixes: etl, mongodb lexicon --- transport/etl.py | 10 +++++++--- transport/mongo.py | 7 +++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/transport/etl.py b/transport/etl.py index d509005..9d520d4 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -106,9 +106,11 @@ class ETL (Process): # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None # self._oargs = _args['target'] #transport.factory.instance(**_args['target']) - self._source = _args['source'] #dict(_args ['source'],**{'context':'write'}) - self._target = _args['target'] #dict(_args['target'],**{'context':'read','lock':True}) - + self._source = _args ['source'] + self._target = _args['target'] + self._source['context'] = 'read' + self._target['context'] = 'write' + self.JOB_COUNT = _args['jobs'] self.jobs = [] # self.logger = transport.factory.instance(**_args['logger']) @@ -131,6 +133,8 @@ class ETL (Process): # writing the data to a designated data source # try: + + _log = {"name":self.name,"rows":{"input":0,"output":0}} _reader = transport.factory.instance(**self._source) if 'table' in self._source 
: diff --git a/transport/mongo.py b/transport/mongo.py index 8ab4418..ae07bce 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -37,7 +37,7 @@ class Mongo : self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] # authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] - + self.dbname = None username = password = None if 'auth_file' in args : _info = json.loads((open(args['auth_file'])).read()) @@ -46,17 +46,20 @@ class Mongo : else: _info = {} _args = dict(args,**_info) + _map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'} for key in _args : if key in ['username','password'] : username = _args['username'] if key=='username' else username password = _args['password'] if key == 'password' else password continue value = _args[key] + if key in _map : + key = _map[key] self.setattr(key,value) # # Let us perform aliasing in order to remain backwards compatible - + self.dbname = self.db if hasattr(self,'db')else self.dbname self.uid = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) if username and password : From 7200be7beb3133b52f97dcf478235e6fb2eb9180 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 May 2023 14:13:36 -0500 Subject: [PATCH 126/271] bug fix: identifier filter --- bin/transport | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index 7200e72..8103cfe 100755 --- a/bin/transport +++ b/bin/transport @@ -65,6 +65,8 @@ if __name__ == '__main__' : _index = int(SYS_ARGS['index']) _info = [_item for _item in _info if _info.index(_item) == _index] pass + elif 'id' in SYS_ARGS : + _info = [_item for _item in _info if 'id' in _item and _item['id'] == SYS_ARGS['id']] procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) diff --git a/setup.py b/setup.py index 483ea87..6a078bc 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.6.8", + "version":"1.7.0", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 2cee6db63b560d7cc3083110fb17dd0a328e76b5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 31 May 2023 14:20:00 -0500 Subject: [PATCH 127/271] bug fix: ... --- bin/transport | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index 8103cfe..8edaecc 100755 --- a/bin/transport +++ b/bin/transport @@ -70,12 +70,15 @@ if __name__ == '__main__' : procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) + print ([len(jobs),' Jobs are running']) + N = len(jobs) while jobs : x = len(jobs) jobs = [_job for _job in jobs if _job.is_alive()] if x != len(jobs) : - print ([len(jobs),'... jobs running']) + print ([len(jobs),'... 
jobs still running']) time.sleep(1) + print ([N,' Finished running']) except Exception as e: print (e) From 18cd1d8b11fd08095ba06078a4fcc623fedc6ee4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Jun 2023 16:07:29 -0500 Subject: [PATCH 128/271] bug fix with sql handling sqlalchemy --- setup.py | 2 +- transport/sql.py | 64 +++++++++++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index 6a078bc..a535591 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.7.0", + "version":"1.7.2", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index 05d254d..27d5888 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -123,28 +123,31 @@ class SQLRW : return self.schema +'.'+name if self.schema not in [None, ''] and '.' not in name else name def has(self,**_args): - found = False - try: + return self.meta(**_args) + # found = False + # try: - table = self._tablename(_args['table'])if 'table' in _args else self._tablename(self.table) - sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) - if self._engine : - _conn = self._engine.connect() - else: - _conn = self.conn - found = pd.read_sql(sql,_conn).shape[0] - found = True + # table = self._tablename(_args['table'])if 'table' in _args else self._tablename(self.table) + # sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) + # if self._engine : + # _conn = self._engine.connect() + # else: + # _conn = self.conn + # found = pd.read_sql(sql,_conn).shape[0] + # found = True - except Exception as e: - pass - finally: - if self._engine : - _conn.close() - return found + # except Exception as e: + # print (e) + # pass + # finally: + # if not self._engine : + # _conn.close() + # return found def isready(self): _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) try: - return pd.read_sql(_sql,self.conn).columns.tolist() + _conn = self.conn if not hasattr(self,'_engine') else self._engine + return pd.read_sql(_sql,_conn).columns.tolist() except Exception as e: pass return False @@ -154,22 +157,24 @@ class SQLRW : :param _sql insert/select statement @TODO: Store procedure calls """ - cursor = self.conn.cursor() + # _out = None try: - if "select" in _sql.lower() : + if _sql.lower().startswith('select') : - # _conn = self._engine if self._engine else self.conn - return pd.read_sql(_sql,self.conn) + _conn = self._engine if self._engine else self.conn + return pd.read_sql(_sql,_conn) else: # Executing a command i.e no expected return values ... 
+ cursor = self.conn.cursor() cursor.execute(_sql) self.conn.commit() except Exception as e : print (e) finally: - self.conn.commit() - cursor.close() + if not self._engine : + self.conn.commit() + # cursor.close() def close(self): try: self.conn.close() @@ -184,12 +189,21 @@ class SQLReader(SQLRW,Reader) : if 'sql' in _args : _sql = (_args['sql']) else: - table = self.table if self.table is not None else _args['table'] + if 'table' in _args : + table = _args['table'] + else: + table = self.table + # table = self.table if self.table is not None else _args['table'] _sql = "SELECT :fields FROM "+self._tablename(table) if 'filter' in _args : _sql = _sql +" WHERE "+_args['filter'] - _fields = '*' if not self.fields else ",".join(self.fields) + if 'fields' in _args : + _fields = _args['fields'] + else: + _fields = '*' if not self.fields else ",".join(self.fields) _sql = _sql.replace(":fields",_fields) + # + # At this point we have a query we can execute gracefully if 'limit' in _args : _sql = _sql + " LIMIT "+str(_args['limit']) # From aaaf64db33d4aa94f7a4c777435a6d4aef92d98d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 Jun 2023 17:39:02 -0500 Subject: [PATCH 129/271] bug fix: versions of sqlalchemy for meta data --- transport/sql.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 27d5888..a262c3a 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -106,8 +106,13 @@ class SQLRW : try: if self._engine : table = _args['table'] if 'table' in _args else self.table - _m = sqlalchemy.MetaData(bind=self._engine) - _m.reflect() + if sqlalchemy.__version__.startswith('1.') : + _m = sqlalchemy.MetaData(bind=self._engine) + _m.reflect() + else: + + _m = sqlalchemy.MetaData() + _m.reflect(bind=self._engine) schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] # # Some house keeping work @@ -117,6 +122,7 @@ class SQLRW : _item['type'] = _m[_item['type']] except Exception as e: + print (e) pass return schema def _tablename(self,name) : From 74d8b2c6f29f79a6c95fe4a25727544e2a21cf25 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Jul 2023 14:51:14 -0500 Subject: [PATCH 130/271] bug fix --- setup.py | 2 +- transport/sql.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index a535591..8e1b424 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.7.2", + "version":"1.7.4", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index a262c3a..0990bcd 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -380,12 +380,14 @@ class BigQuery: :param table name of the name WITHOUT including dataset :param sql sql query to be pulled, """ - table = _args['table'] + table = _args['table'] if 'table' in _args else self.table try: - ref = self.client.dataset(self.dataset).table(table) - _schema = self.client.get_table(ref).schema - return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] - + if table : + ref = self.client.dataset(self.dataset).table(table) + _schema = self.client.get_table(ref).schema + return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") 
else _item.description )} for _item in _schema] + else : + return [] except Exception as e: return [] def has(self,**_args): From b27c5a88d4960b3b4faf9a5099c0b6bf78fa5e07 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Jul 2023 15:53:54 -0500 Subject: [PATCH 131/271] bug fix: bigquery meta on a project other than the one with service key --- transport/sql.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 0990bcd..00499e5 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -381,14 +381,20 @@ class BigQuery: :param sql sql query to be pulled, """ table = _args['table'] if 'table' in _args else self.table + try: if table : - ref = self.client.dataset(self.dataset).table(table) - _schema = self.client.get_table(ref).schema - return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] + _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] + sql = f"""SELECT column_name as table_name, data_type as field_type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ + return self.read(sql=sql).to_dict(orient='records') + # ref = self.client.dataset(self.dataset).table(table) + + # _schema = self.client.get_table(ref).schema + # return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] else : return [] except Exception as e: + return [] def has(self,**_args): found = False From 687b211b7b0cfe53b6e31d8c58d1bb4d4085023a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Jul 2023 15:58:51 -0500 Subject: [PATCH 132/271] bug fix: field_name (typographic error) --- transport/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index 00499e5..2977044 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -385,7 +385,7 @@ class BigQuery: try: if table : _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] - sql = f"""SELECT column_name as table_name, data_type as field_type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ + sql = f"""SELECT column_name as field_name, data_type as field_type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ return self.read(sql=sql).to_dict(orient='records') # ref = self.client.dataset(self.dataset).table(table) From e9c02d558a1b12fa55a25b644bed66971c9065af Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Jul 2023 15:59:13 -0500 Subject: [PATCH 133/271] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e1b424..2b17e62 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.7.4", + "version":"1.7.6", "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} From 712c3d076d6d5224a403cf5653550ce746abcb96 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 16:32:05 -0500 Subject: [PATCH 134/271] bug fix: schema specification in sql handling --- setup.py | 3 ++- transport/sql.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2b17e62..136082a 100644 --- a/setup.py +++ b/setup.py @@ -4,11 +4,12 @@ This is a 
build file for the from setuptools import setup, find_packages import os import sys +from transport import __version__ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", - "version":"1.7.6", + "version":__version__, "author":"The Phi Technology LLC","author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} diff --git a/transport/sql.py b/transport/sql.py index 2977044..b387bcf 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -298,7 +298,17 @@ class SQLWriter(SQLRW,Writer): try: table = _args['table'] if 'table' in _args else self.table - self.schema = _args['schema'] if 'schema' in _args else self.schema + # + # In SQL, schema can stand for namespace or the structure of a table + # In case we have a list, we are likely dealing with table structure + # + if 'schema' in _args : + if type(_args['schema']) == str : + self.schema = _args['schema'] if 'schema' in _args else self.schema + elif type(_args['schema']) == list: + self.make(schema=_args['schema']) + pass + # self.schema = _args['schema'] if 'schema' in _args else self.schema table = self._tablename(table) _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) @@ -385,7 +395,7 @@ class BigQuery: try: if table : _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] - sql = f"""SELECT column_name as field_name, data_type as field_type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ + sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ return self.read(sql=sql).to_dict(orient='records') # ref = self.client.dataset(self.dataset).table(table) From fb20687a461ea55b981561ca64ed7ca438cd63b1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 16:37:30 -0500 Subject: [PATCH 135/271] version update --- transport/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/__init__.py b/transport/__init__.py index f43a184..90204da 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -21,6 +21,7 @@ The configuration for the data-store is as follows : provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ __author__ = 'The Phi Technology' +__version__= '1.7.8' import pandas as pd import numpy as np import json From f3f7bf9754fb1c4fa1418f422cfb38574d5cd92a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:36:41 -0500 Subject: [PATCH 136/271] bug fix --- transport/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index b387bcf..bf34c76 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -306,7 +306,7 @@ class SQLWriter(SQLRW,Writer): if type(_args['schema']) == str : self.schema = _args['schema'] if 'schema' in _args else self.schema elif type(_args['schema']) == list: - self.make(schema=_args['schema']) + self.make(table=table,schema=_args['schema']) pass # self.schema = _args['schema'] if 'schema' in _args else self.schema table = self._tablename(table) From 5f1592b973383db1aa07add55b6a23356f1555c7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:36:41 -0500 Subject: [PATCH 137/271] bug fix --- transport/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index b387bcf..bf34c76 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ 
-306,7 +306,7 @@ class SQLWriter(SQLRW,Writer): if type(_args['schema']) == str : self.schema = _args['schema'] if 'schema' in _args else self.schema elif type(_args['schema']) == list: - self.make(schema=_args['schema']) + self.make(table=table,schema=_args['schema']) pass # self.schema = _args['schema'] if 'schema' in _args else self.schema table = self._tablename(table) From f57cd8d871c7d692ec476276b87690360817d3d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:46:01 -0500 Subject: [PATCH 138/271] version update --- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index 90204da..7c767a2 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -21,7 +21,7 @@ The configuration for the data-store is as follows : provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ __author__ = 'The Phi Technology' -__version__= '1.7.8' +__version__= '1.7.84' import pandas as pd import numpy as np import json From 1885e3a5f64ce659d3ab56e85c8e1fea52ed5605 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:51:09 -0500 Subject: [PATCH 139/271] bug fix, making table when provided a schema --- transport/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index bf34c76..a4a66c8 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -129,7 +129,7 @@ class SQLRW : return self.schema +'.'+name if self.schema not in [None, ''] and '.' not in name else name def has(self,**_args): - return self.meta(**_args) + return len(self.meta(**_args)) > 0 # found = False # try: @@ -305,7 +305,7 @@ class SQLWriter(SQLRW,Writer): if 'schema' in _args : if type(_args['schema']) == str : self.schema = _args['schema'] if 'schema' in _args else self.schema - elif type(_args['schema']) == list: + elif type(_args['schema']) == list and not self.has(table): self.make(table=table,schema=_args['schema']) pass # self.schema = _args['schema'] if 'schema' in _args else self.schema From ad84e24533b47aa8c65d8fdb036147eac971b7cf Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:51:47 -0500 Subject: [PATCH 140/271] version update --- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index 7c767a2..cb22924 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -21,7 +21,7 @@ The configuration for the data-store is as follows : provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ __author__ = 'The Phi Technology' -__version__= '1.7.84' +__version__= '1.8.0' import pandas as pd import numpy as np import json From b5794e0a610f03b96642b082b1de1bc8de675d21 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 17:58:32 -0500 Subject: [PATCH 141/271] bug fixes has function --- transport/sql.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index a4a66c8..ea58c9e 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -305,7 +305,7 @@ class SQLWriter(SQLRW,Writer): if 'schema' in _args : if type(_args['schema']) == str : self.schema = _args['schema'] if 'schema' in _args else self.schema - elif type(_args['schema']) == list and not self.has(table): + elif type(_args['schema']) == list and not self.has(table=table): self.make(table=table,schema=_args['schema']) pass # self.schema = _args['schema'] if 'schema' in _args else 
self.schema @@ -409,7 +409,8 @@ class BigQuery: def has(self,**_args): found = False try: - found = self.meta(**_args) is not None + _has = self.meta(**_args) + found = _has is not None and len(_has) > 0 except Exception as e: pass return found From 0933e4155f365274e8c33902cca414bdef646ab7 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 18:03:03 -0500 Subject: [PATCH 142/271] bug fixes has function --- transport/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index ea58c9e..0250553 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -129,7 +129,7 @@ class SQLRW : return self.schema +'.'+name if self.schema not in [None, ''] and '.' not in name else name def has(self,**_args): - return len(self.meta(**_args)) > 0 + return self.meta(**_args) # found = False # try: From 653d22cd8c853ecd53d181c997e9bd8a548249de Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 20:47:54 -0500 Subject: [PATCH 143/271] bug fix: init function for fields not working --- transport/sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transport/sql.py b/transport/sql.py index 0250553..38a1f7e 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -239,6 +239,8 @@ class SQLWriter(SQLRW,Writer): try: table = self._tablename(self.table) self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() + except Exception as e: + pass finally: pass else: From 8f94ea260bed6e68f7c4535a4975c6a925865c7a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 20:58:00 -0500 Subject: [PATCH 144/271] bug fix: initialization of fields unnecessary --- transport/sql.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 38a1f7e..5357852 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -235,16 +235,16 @@ class SQLWriter(SQLRW,Writer): self._cast = False if 'cast' not in _args else _args['cast'] def init(self,fields=None): - if not fields : - try: - table = self._tablename(self.table) - self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() - except Exception as e: - pass - finally: - pass - else: - self.fields = fields; + # if not fields : + # try: + # table = self._tablename(self.table) + # self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() + # except Exception as e: + # pass + # finally: + # pass + # else: + self.fields = fields; def make(self,**_args): table = self._tablename(self.table) if 'table' not in _args else self._tablename(_args['table']) From 05d990d7138f27c5b416c5776a4c653ce6544188 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 9 Jul 2023 21:22:31 -0500 Subject: [PATCH 145/271] bug fix: table creation with schema (if any is provided) --- transport/sql.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index 5357852..0d74fdc 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -307,7 +307,10 @@ class SQLWriter(SQLRW,Writer): if 'schema' in _args : if type(_args['schema']) == str : self.schema = _args['schema'] if 'schema' in _args else self.schema - elif type(_args['schema']) == list and not self.has(table=table): + elif type(_args['schema']) == list and len(_args['schema']) > 0 and not self.has(table=table): + # + # There is a messed up case when an empty array is passed (no table 
should be created) + # self.make(table=table,schema=_args['schema']) pass # self.schema = _args['schema'] if 'schema' in _args else self.schema From fe21f44388218ce77732991ca2c7bce16bd68feb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 17 Jul 2023 15:42:42 -0500 Subject: [PATCH 146/271] bug fix version --- setup.py | 7 +++++-- transport/__init__.py | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 136082a..8e0bc46 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,16 @@ This is a build file for the from setuptools import setup, find_packages import os import sys -from transport import __version__ +from transport.version import __version__,__author__ +# __author__ = 'The Phi Technology' +# __version__= '1.8.0' + def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":"data-transport", "version":__version__, - "author":"The Phi Technology LLC","author_email":"info@the-phi.com", + "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] diff --git a/transport/__init__.py b/transport/__init__.py index cb22924..57c3f22 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -20,8 +20,7 @@ The configuration for the data-store is as follows : mongodb provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ -__author__ = 'The Phi Technology' -__version__= '1.8.0' + import pandas as pd import numpy as np import json @@ -38,6 +37,7 @@ if sys.version_info[0] > 2 : from transport import mongo as mongo from transport import sql as sql from transport import etl as etl + from transport.version import __version__ else: from common import Reader, Writer,Console #, factory import disk @@ -47,6 +47,7 @@ else: import s3 import sql import etl + from version import __version__ import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq From f4c0ca80aa06ae38320583053fb6b4002114432b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 17 Jul 2023 15:46:14 -0500 Subject: [PATCH 147/271] bug fix version --- transport/version.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 transport/version.py diff --git a/transport/version.py b/transport/version.py new file mode 100644 index 0000000..07d7737 --- /dev/null +++ b/transport/version.py @@ -0,0 +1,2 @@ +__author__ = 'The Phi Technology' +__version__= '1.8.0' \ No newline at end of file From 5ffdf052ee49902dc3f388c10d6e87428efee164 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 17 Jul 2023 15:50:13 -0500 Subject: [PATCH 148/271] bug fix --- setup.py | 2 +- version.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 version.py diff --git a/setup.py b/setup.py index 8e0bc46..c3a8547 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ This is a build file for the from setuptools import setup, find_packages import os import sys -from transport.version import __version__,__author__ +from version import __version__,__author__ # __author__ = 'The Phi Technology' # __version__= '1.8.0' diff --git a/version.py b/version.py new file mode 120000 index 0000000..e666b28 --- /dev/null +++ b/version.py @@ -0,0 +1 @@ +transport/version.py \ No newline at end of file From b5dc44a79754e7e95f5785db499d906f0c2eb132 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Jul 2023 16:06:20 -0500 Subject: [PATCH 149/271] bug fix: sqlalchemy version --- setup.py 
| 2 +- transport/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c3a8547..7eff1e4 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ args = { "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy<2.0.0','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/version.py b/transport/version.py index 07d7737..a8f9b1f 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.8.0' \ No newline at end of file +__version__= '1.8.1' From 3b5c9d15035cda5b2fa723e6731a98525e878995 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Jul 2023 09:09:18 -0500 Subject: [PATCH 150/271] bugfix: reformatting factory singleton and adding python queue as qlistener --- transport/__init__.py | 148 ++++++++++++++++++++++++++++++++++------- transport/mongo.py | 5 +- transport/providers.py | 63 ++++++++++++++++++ transport/qlistener.py | 42 ++++++++++++ transport/sql.py | 13 ++-- 5 files changed, 242 insertions(+), 29 deletions(-) create mode 100644 transport/providers.py create mode 100644 transport/qlistener.py diff --git a/transport/__init__.py b/transport/__init__.py index 57c3f22..8a45800 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -38,6 +38,7 @@ if sys.version_info[0] > 2 : from transport import sql as sql from transport import etl as etl from transport.version import __version__ + from transport import providers else: from common import Reader, Writer,Console #, factory import disk @@ -48,37 +49,39 @@ else: import sql import etl from version import __version__ + import providers import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq import nzpy as nz #--- netezza drivers import os -class providers : - POSTGRESQL = 'postgresql' - MONGODB = 'mongodb' +# class providers : +# POSTGRESQL = 'postgresql' +# MONGODB = 'mongodb' - BIGQUERY ='bigquery' - FILE = 'file' - ETL = 'etl' - SQLITE = 'sqlite' - SQLITE3= 'sqlite' - REDSHIFT = 'redshift' - NETEZZA = 'netezza' - MYSQL = 'mysql' - RABBITMQ = 'rabbitmq' - MARIADB = 'mariadb' - COUCHDB = 'couch' - CONSOLE = 'console' - ETL = 'etl' - # - # synonyms of the above - BQ = BIGQUERY - MONGO = MONGODB - FERRETDB= MONGODB - PG = POSTGRESQL - PSQL = POSTGRESQL - PGSQL = POSTGRESQL +# BIGQUERY ='bigquery' +# FILE = 'file' +# ETL = 'etl' +# SQLITE = 'sqlite' +# SQLITE3= 'sqlite' +# REDSHIFT = 'redshift' +# NETEZZA = 'netezza' +# MYSQL = 'mysql' +# RABBITMQ = 'rabbitmq' +# MARIADB = 'mariadb' +# COUCHDB = 'couch' +# CONSOLE = 'console' +# ETL = 'etl' +# # +# # synonyms of the above +# BQ = BIGQUERY +# MONGO = MONGODB +# FERRETDB= MONGODB +# PG = POSTGRESQL +# PSQL = POSTGRESQL +# PGSQL = POSTGRESQL +# import providers class IEncoder (json.JSONEncoder): def default 
(self,object): @@ -156,6 +159,103 @@ class factory : import time def instance(**_args): """ + creating an instance given the provider, we should have an idea of :class, :driver + :provider + :read|write = {connection to the database} + """ + _provider = _args['provider'] + _group = None + + for _id in providers.CATEGORIES : + if _provider in providers.CATEGORIES[_id] : + _group = _id + break + if _group : + _classPointer = _getClassInstance(_group,**_args) + # + # Let us reformat the arguments + if 'read' in _args or 'write' in _args : + _args = _args['read'] if 'read' in _args else _args['write'] + _args['provider'] = _provider + if _group == 'sql' : + _info = _get_alchemyEngine(**_args) + + _args = dict(_args,**_info) + _args['driver'] = providers.DRIVERS[_provider] + + else: + if _provider in providers.DEFAULT : + _default = providers.DEFAULT[_provider] + _defkeys = list(set(_default.keys()) - set(_args.keys())) + if _defkeys : + for key in _defkeys : + _args[key] = _default[key] + pass + # + # get default values from + + return _classPointer(**_args) + # + # Let us determine the category of the provider that has been given +def _get_alchemyEngine(**_args): + """ + This function returns the SQLAlchemy engine associated with parameters, This is only applicable for SQL _items + :_args arguments passed to the factory {provider and other} + """ + #@TODO: Enable authentication files (private_key) + _username = _args['username'] if 'username' in _args else '' + _password = _args['password'] if 'password' in _args else '' + _account = _args['account'] if 'account' in _args else '' + _database = _args['database'] + _provider = _args['provider'] + if _username != '': + _account = _username + ':'+_password+'@' + _host = _args['host'] if 'host' in _args else '' + _port = _args['port'] if 'port' in _args else '' + if _provider in providers.DEFAULT : + _default = providers.DEFAULT[_provider] + _host = _host if _host != '' else (_default['host'] if 'host' in _default else '') + _port = _port if _port != '' else (_default['port'] if 'port' in _default else '') + if _port == '': + _port = providers.DEFAULT['port'] if 'port' in providers.DEFAULT else '' + # + + if _host != '' and _port != '' : + _fhost = _host+":"+str(_port) #--formatted hostname + else: + _fhost = _host + # Let us update the parameters we have thus far + # + + + uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) + + _engine = sqlalchemy.create_engine (uri,future=True) + _out = {'sqlalchemy':_engine} + _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} + for key in _pargs : + if _pargs[key] != '' : + _out[key] = _pargs[key] + return _out +def _getClassInstance(_group,**_args): + """ + This function returns the class instance we are attempting to instanciate + :_group items in providers.CATEGORIES.keys() + :_args arguments passed to the factory class + """ + if 'read' in _args or 'write' in _args : + _context = 'read' if 'read' in _args else _args['write'] + _info = _args[_context] + else: + _context = _args['context'] if 'context' in _args else 'read' + _class = providers.READ[_group] if _context == 'read' else providers.WRITE[_group] + if type(_class) == dict and _args['provider'] in _class: + _class = _class[_args['provider']] + + return _class + +def __instance(**_args): + """ @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...} @param context read|write|rw diff --git a/transport/mongo.py b/transport/mongo.py index ae07bce..96c9075 100644 --- a/transport/mongo.py 
+++ b/transport/mongo.py @@ -127,6 +127,8 @@ class MongoReader(Mongo,Reader): return pd.DataFrame(r) else: + + if 'table' in args or 'collection' in args : if 'table' in args: _uid = args['table'] @@ -134,7 +136,8 @@ class MongoReader(Mongo,Reader): _uid = args['collection'] else: _uid = self.uid - + else: + _uid = self.uid collection = self.db[_uid] _filter = args['filter'] if 'filter' in args else {} _df = pd.DataFrame(collection.find(_filter)) diff --git a/transport/providers.py b/transport/providers.py new file mode 100644 index 0000000..cf5ed59 --- /dev/null +++ b/transport/providers.py @@ -0,0 +1,63 @@ +from transport.common import Reader, Writer,Console #, factory +from transport import disk +import sqlite3 +from transport import s3 as s3 +from transport import rabbitmq as queue +from transport import couch as couch +from transport import mongo as mongo +from transport import sql as sql +from transport import etl as etl +from transport import qlistener +import psycopg2 as pg +import mysql.connector as my +from google.cloud import bigquery as bq +import nzpy as nz #--- netezza drivers +import os + +from transport.version import __version__ + +POSTGRESQL = 'postgresql' +MONGODB = 'mongodb' +HTTP='http' +BIGQUERY ='bigquery' +FILE = 'file' +ETL = 'etl' +SQLITE = 'sqlite' +SQLITE3= 'sqlite' +REDSHIFT = 'redshift' +NETEZZA = 'netezza' +MYSQL = 'mysql' +RABBITMQ = 'rabbitmq' +MARIADB = 'mariadb' +COUCHDB = 'couch' +CONSOLE = 'console' +ETL = 'etl' +# +# synonyms of the above +BQ = BIGQUERY +MONGO = MONGODB +FERRETDB= MONGODB +PG = POSTGRESQL +PSQL = POSTGRESQL +PGSQL = POSTGRESQL +S3 = 's3' +AWS_S3 = 's3' +RABBIT = RABBITMQ + +QLISTENER = 'qlistener' + +DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} +CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[BIGQUERY],'file':[FILE], + 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QLISTENER],'http':[HTTP]} + +READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader},'cloud':sql.BigQueryReader, + 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener} + } +WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, + 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener} + } +DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} +DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} +DEFAULT[REDSHIFT] = DEFAULT[PG] +DEFAULT[MARIADB] = DEFAULT[MYSQL] +DEFAULT[NETEZZA] = {'port':5480} \ No newline at end of file diff --git a/transport/qlistener.py b/transport/qlistener.py new file mode 100644 index 0000000..495b731 --- /dev/null +++ b/transport/qlistener.py @@ -0,0 +1,42 @@ +import queue +from threading import Thread, Lock +from transport.common import Reader,Writer +import numpy as np +import pandas as pd + +class qListener : + lock = Lock() + _queue = {'default':queue.Queue()} + def __init__(self,**_args): + self._cache = {} + self._callback = _args['callback'] if 'callback' in _args else None + self._id = _args['id'] if 'id' in _args else 'default' + if self._id not in qListener._queue : + qListener._queue[self._id] = queue.Queue() + thread = Thread(target=self._forward) + thread.start() + def _forward(self): + _q = qListener._queue[self._id] + _data = _q.get() + _q.task_done() + self._callback(_data) + + def has(self,**_args) : + return self._callback is not None + + + def close(self): + 
""" + This will empty the queue and have it ready for another operation + """ + _q = qListener._queue[self._id] + with _q.mutex: + _q.queue.clear() + _q.all_tasks_done.notify_all() + + def write(self,_data,**_args): + _id = _args['id'] if 'id' in _args else self._id + + _q = qListener._queue[_id] + _q.put(_data) + _q.join() diff --git a/transport/sql.py b/transport/sql.py index 0d74fdc..ffabb54 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -29,6 +29,7 @@ from multiprocessing import Lock, RLock import pandas as pd import numpy as np import nzpy as nz #--- netezza drivers +import sqlite3 import copy import os @@ -58,8 +59,8 @@ class SQLRW : # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] - _info['host'] = _args['host'] - _info['port'] = _args['port'] + _info['host'] = _args['host'] if 'host' in _args else '' + _info['port'] = _args['port'] if 'port' in _args else '' # if 'host' in _args : # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] @@ -98,8 +99,12 @@ class SQLRW : if _handler == my : _info['database'] = _info['dbname'] del _info['dbname'] - - self.conn = _handler.connect(**_info) + if _handler == sqlite3 : + _info = {'path':_info['dbname'],'isolation_level':'IMMEDIATE'} + if _handler != sqlite3 : + self.conn = _handler.connect(**_info) + else: + self.conn = _handler.connect(_info['path'],isolation_level='IMMEDIATE') self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None def meta(self,**_args): schema = [] From 6fedef41a3f4beb170f7ad5481f587f8f9eb1171 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Jul 2023 09:10:18 -0500 Subject: [PATCH 151/271] version update --- transport/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/version.py b/transport/version.py index 07d7737..9db71e5 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.8.0' \ No newline at end of file +__version__= '1.8.2' From 98e0aee929ebb97af26e13ea90a0632c6ad59278 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 28 Jul 2023 09:13:09 -0500 Subject: [PATCH 152/271] version update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8e0bc46..6682394 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ args = { "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','sqlalchemy','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy<2.0.0','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : From 61da01f9e9d13c40d1f4b28f87ae261270f9f551 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Aug 2023 10:26:52 -0500 Subject: [PATCH 153/271] bug fixes: console/cli handling --- transport/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff 
--git a/transport/providers.py b/transport/providers.py index cf5ed59..a7de3d9 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -54,7 +54,8 @@ READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.Cou 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener} } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, - 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener} + 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} + } DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} From 55317e52573811f4425481a74c09a728dcc0571e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Aug 2023 10:29:55 -0500 Subject: [PATCH 154/271] bug fix --- transport/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/providers.py b/transport/providers.py index a7de3d9..4fe4784 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -51,7 +51,8 @@ CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,C 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QLISTENER],'http':[HTTP]} READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader},'cloud':sql.BigQueryReader, - 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener} + 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, + 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} From bcb65d873a82d1cf3e31455588d364cbfb04e009 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Aug 2023 10:26:52 -0500 Subject: [PATCH 155/271] bug fixes: console/cli handling --- transport/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/providers.py b/transport/providers.py index cf5ed59..a7de3d9 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -54,7 +54,8 @@ READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.Cou 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener} } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, - 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener} + 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} + } DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} From 1ebe2535954af826e4ce7e0742b616f12c8d8826 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Aug 2023 10:29:55 -0500 Subject: [PATCH 156/271] bug fix --- transport/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transport/providers.py b/transport/providers.py index a7de3d9..4fe4784 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -51,7 +51,8 @@ CATEGORIES 
={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,C 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QLISTENER],'http':[HTTP]} READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader},'cloud':sql.BigQueryReader, - 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener} + 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, + 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} From 7b25e392b0c54b635377b0d3ce5a4e0c9e04563c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 15 Aug 2023 15:27:19 -0500 Subject: [PATCH 157/271] bug fixes --- transport/sql.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transport/sql.py b/transport/sql.py index ffabb54..da412fa 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -406,7 +406,9 @@ class BigQuery: if table : _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ - return self.read(sql=sql).to_dict(orient='records') + _info = {'credentials':self.credentials,'dialect':'standard'} + return pd.read_gbq(sql,**_info).to_dict(orient='records') + # return self.read(sql=sql).to_dict(orient='records') # ref = self.client.dataset(self.dataset).table(table) # _schema = self.client.get_table(ref).schema From 0da32069e2ca5245b13d9a70159c95f0dfd53774 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 8 Sep 2023 11:28:35 -0500 Subject: [PATCH 158/271] databricks support --- transport/__init__.py | 15 +++++- transport/bricks.py | 111 +++++++++++++++++++++++++++++++++++++++++ transport/providers.py | 11 ++-- transport/sql.py | 4 +- transport/version.py | 2 +- 5 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 transport/bricks.py diff --git a/transport/__init__.py b/transport/__init__.py index 8a45800..4c2270c 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -157,20 +157,33 @@ class factory : return anObject import time -def instance(**_args): +def instance(**_pargs): """ creating an instance given the provider, we should have an idea of :class, :driver :provider :read|write = {connection to the database} """ + # + # @TODO: provide authentication file that will hold all the parameters, that will later on be used + # + _args = dict(_pargs,**{}) + if 'auth_file' in _args : + path = _args['auth_file'] + file = open(path) + _config = json.loads( file.read()) + _args = dict(_args,**_config) + file.close() + _provider = _args['provider'] _group = None + for _id in providers.CATEGORIES : if _provider in providers.CATEGORIES[_id] : _group = _id break if _group : + _classPointer = _getClassInstance(_group,**_args) # # Let us reformat the arguments diff --git a/transport/bricks.py b/transport/bricks.py new file mode 100644 index 0000000..0aa4383 --- /dev/null +++ b/transport/bricks.py @@ -0,0 +1,111 @@ +""" +This file implements databricks handling, This functionality will rely on databricks-sql-connector +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files 
(the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +@TODO: + - Migrate SQLite to SQL hierarchy + - Include Write in Chunks from pandas +""" +import os +import sqlalchemy +from transport.common import Reader,Writer +import pandas as pd + + +class Bricks: + """ + :host + :token + :database + :cluster_path + :table + """ + def __init__(self,**_args): + _host = _args['host'] + _token= _args['token'] + _cluster_path = _args['cluster_path'] + self._schema = _args['schema'] if 'schema' in _args else _args['database'] + _catalog = _args['catalog'] + self._table = _args['table'] if 'table' in _args else None + + # + # @TODO: + # Sometimes when the cluster isn't up and running it takes a while, the user should be alerted of this + # + + _uri = f'''databricks://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}''' + self._engine = sqlalchemy.create_engine (_uri) + pass + def meta(self,**_args): + table = _args['table'] if 'table' in _args else self._table + if not table : + return [] + else: + if sqlalchemy.__version__.startswith('1.') : + _m = sqlalchemy.MetaData(bind=self._engine) + _m.reflect(only=[table]) + else: + _m = sqlalchemy.MetaData() + _m.reflect(bind=self._engine) + # + # Let's retrieve te information associated with a table + # + return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns] + + def has(self,**_args): + return self.meta(**_args) + def apply(self,_sql): + try: + if _sql.lower().startswith('select') : + return pd.read_sql(_sql,self._engine) + except Exception as e: + pass + +class BricksReader(Bricks,Reader): + """ + This class is designed for reads and will execute reads against a table name or a select SQL statement + """ + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + limit = None if 'limit' not in _args else str(_args['limit']) + + if 'sql' in _args : + sql = _args['sql'] + elif 'table' in _args : + table = _args['table'] + sql = f'SELECT * FROM {table}' + if limit : + sql = sql + f' LIMIT {limit}' + + if 'sql' in _args or 'table' in _args : + return self.apply(sql) + else: + return pd.DataFrame() + pass +class BricksWriter(Bricks,Writer): + def __init__(self,**_args): + super().__init__(**_args) + def write(self,_data,**_args): + """ + This data will write data to data-bricks against a given table. 
If the table is not specified upon initiazation, it can be specified here + _data: data frame to push to databricks + _args: chunks, table, schema + """ + _schema = self._schema if 'schema' not in _args else _args['schema'] + _table = self._table if 'table' not in _args else _args['table'] + _df = _data if type(_data) == pd.DataFrame else _data + if type(_df) == dict : + _df = [_df] + if type(_df) == list : + _df = pd.DataFrame(_df) + _df.to_sql( + name=_table,schema=_schema, + con=self._engine,if_exists='append',index=False); + pass diff --git a/transport/providers.py b/transport/providers.py index 4fe4784..a638a89 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -8,6 +8,7 @@ from transport import mongo as mongo from transport import sql as sql from transport import etl as etl from transport import qlistener +from transport import bricks import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq @@ -45,16 +46,18 @@ AWS_S3 = 's3' RABBIT = RABBITMQ QLISTENER = 'qlistener' - +DATABRICKS= 'databricks+connector' DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} -CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[BIGQUERY],'file':[FILE], +CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[BIGQUERY,DATABRICKS],'file':[FILE], 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QLISTENER],'http':[HTTP]} -READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader},'cloud':sql.BigQueryReader, +READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader}, + 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader}, 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} } -WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter},'cloud':sql.BigQueryWriter, +WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter}, + 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter}, 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} } diff --git a/transport/sql.py b/transport/sql.py index da412fa..3c555f5 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -431,8 +431,8 @@ class BQReader(BigQuery,Reader) : super().__init__(**_args) def apply(self,sql): - self.read(sql=sql) - pass + return self.read(sql=sql) + def read(self,**_args): SQL = None table = self.table if 'table' not in _args else _args['table'] diff --git a/transport/version.py b/transport/version.py index 9db71e5..6d0f952 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.8.2' +__version__= '1.8.4' From 324d81bd167d89c299834f454ff60412dda56af6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 18 Sep 2023 20:00:40 -0500 Subject: [PATCH 159/271] bug fix with mysql --- transport/__init__.py | 1 - transport/mongo.py | 10 ++++++++-- transport/providers.py | 2 +- transport/version.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 4c2270c..bbb2e50 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -242,7 +242,6 @@ def _get_alchemyEngine(**_args): uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) 
- _engine = sqlalchemy.create_engine (uri,future=True) _out = {'sqlalchemy':_engine} _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} diff --git a/transport/mongo.py b/transport/mongo.py index 96c9075..c24b4b8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -95,10 +95,16 @@ class MongoReader(Mongo,Reader): Mongo.__init__(self,**args) def read(self,**args): - if 'mongo' in args or 'cmd' in args: + if 'mongo' in args or 'cmd' in args or 'pipeline' in args: # # @TODO: - cmd = args['mongo'] if 'mongo' in args else args['cmd'] + cmd = {} + if 'pipeline' in args : + cmd['pipeline']= args['pipeline'] + if 'aggregate' not in cmd : + cmd['aggregate'] = self.collection + if 'pipeline' not in args or 'aggregate' not in cmd : + cmd = args['mongo'] if 'mongo' in args else args['cmd'] if "aggregate" in cmd : if "allowDiskUse" not in cmd : cmd["allowDiskUse"] = True diff --git a/transport/providers.py b/transport/providers.py index a638a89..c1c4bae 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -27,7 +27,7 @@ SQLITE = 'sqlite' SQLITE3= 'sqlite' REDSHIFT = 'redshift' NETEZZA = 'netezza' -MYSQL = 'mysql' +MYSQL = 'mysql+mysqlconnector' RABBITMQ = 'rabbitmq' MARIADB = 'mariadb' COUCHDB = 'couch' diff --git a/transport/version.py b/transport/version.py index 6d0f952..ec087c4 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.8.4' +__version__= '1.8.6' From 3f7f3d7306f6339f7bb945936af522ea69017296 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 29 Sep 2023 20:27:53 -0500 Subject: [PATCH 160/271] refactor: factory, etl, fixes: session --- bin/transport | 178 +++++++++++------ setup.py | 2 +- transport/__init__.py | 113 ++++++----- transport/common.py | 46 ++--- transport/etl.py | 435 +++++++++++++++++++++++++---------------- transport/providers.py | 33 +++- transport/qlistener.py | 5 + transport/session.py | 100 ++++++---- transport/sql.py | 22 +-- transport/version.py | 2 +- 10 files changed, 593 insertions(+), 343 deletions(-) diff --git a/bin/transport b/bin/transport index 8edaecc..2225f3b 100755 --- a/bin/transport +++ b/bin/transport @@ -14,19 +14,27 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI Usage : - transport --config --procs -@TODO: Create tables if they don't exist for relational databases -example of configuration : - -1. Move data from a folder to a data-store - transport [--folder ] --config #-- assuming the configuration doesn't have folder - transport --folder --provider -- --table|doc -In this case the configuration should look like : - {folder:..., target:{}} -2. 
Move data from one source to another - transport --config - {source:{..},target:{..}} or [{source:{..},target:{..}},{source:{..},target:{..}}] - + transport help -- will print this page + + transport move [index] + path to the configuration file + optional index within the configuration file + +e.g: configuration file (JSON formatted) + - single source to a single target + + {"source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"} + "target":{"provider":"sqlite3","path":"transport-demo.sqlite","table":"agreement"} + } + + - single source to multiple targets + { + "source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"}, + "target":[ + {"provider":"sqlite3","path":"transport-demo.sqlite","table":"agreement}, + {"provider":"mongodb","db":"transport-demo","collection":"agreement"} + ] + } """ import pandas as pd @@ -36,51 +44,111 @@ import sys import transport import time from multiprocessing import Process -SYS_ARGS = {} -if len(sys.argv) > 1: +import typer +import os +from transport import etl +from transport import providers + +# SYS_ARGS = {} +# if len(sys.argv) > 1: - N = len(sys.argv) - for i in range(1,N): - value = None - if sys.argv[i].startswith('--'): - key = sys.argv[i][2:] #.replace('-','') - SYS_ARGS[key] = 1 - if i + 1 < N: - value = sys.argv[i + 1] = sys.argv[i+1].strip() - if key and value and not value.startswith('--'): - SYS_ARGS[key] = value +# N = len(sys.argv) +# for i in range(1,N): +# value = None +# if sys.argv[i].startswith('--'): +# key = sys.argv[i][2:] #.replace('-','') +# SYS_ARGS[key] = 1 +# if i + 1 < N: +# value = sys.argv[i + 1] = sys.argv[i+1].strip() +# if key and value and not value.startswith('--'): +# SYS_ARGS[key] = value - i += 2 - -if __name__ == '__main__' : - # - # Load information from the file ... 
- if 'help' in SYS_ARGS : - print (__doc__) - else: - try: - _info = json.loads(open(SYS_ARGS['config']).read()) - if 'index' in SYS_ARGS : - _index = int(SYS_ARGS['index']) - _info = [_item for _item in _info if _info.index(_item) == _index] - pass - elif 'id' in SYS_ARGS : - _info = [_item for _item in _info if 'id' in _item and _item['id'] == SYS_ARGS['id']] +# i += 2 + +app = typer.Typer() + +# @app.command() +def help() : + print (__doc__) +def wait(jobs): + while jobs : + jobs = [thread for thread in jobs if thread.is_alive()] + time.sleep(1) + +@app.command() +def move (path,index=None): + + _proxy = lambda _object: _object.write(_object.read()) + if os.path.exists(path): + file = open(path) + _config = json.loads (file.read() ) + file.close() + if index : + _config = _config[ int(index)] + etl.instance(**_config) + else: + etl.instance(_config) + + # + # if type(_config) == dict : + # _object = transport.etl.instance(**_config) + # _proxy(_object) + # else: + # # + # # here we are dealing with a list of objects (long ass etl job) + # jobs = [] + # failed = [] + # for _args in _config : + # if index and _config.index(_args) != index : + # continue + + # _object=transport.etl.instance(**_args) + # thread = Process(target=_proxy,args=(_object,)) + # thread.start() + # jobs.append(thread()) + # if _config.index(_args) == 0 : + # thread.join() + wait(jobs) + +@app.command() +def generate (path:str): + __doc__=""" + + """ + _config = [{"source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"},"target":{"provider":"file","path":"addresses.csv","delimiter":"csv"}}] + file = open(path,'w') + file.write(json.dumps(_config)) + file.close() + +# if __name__ == '__main__' : +# # +# # Load information from the file ... +# if 'help' in SYS_ARGS : +# print (__doc__) +# else: +# try: +# _info = json.loads(open(SYS_ARGS['config']).read()) +# if 'index' in SYS_ARGS : +# _index = int(SYS_ARGS['index']) +# _info = [_item for _item in _info if _info.index(_item) == _index] +# pass +# elif 'id' in SYS_ARGS : +# _info = [_item for _item in _info if 'id' in _item and _item['id'] == SYS_ARGS['id']] - procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) - jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) - print ([len(jobs),' Jobs are running']) - N = len(jobs) - while jobs : - x = len(jobs) - jobs = [_job for _job in jobs if _job.is_alive()] - if x != len(jobs) : - print ([len(jobs),'... jobs still running']) - time.sleep(1) - print ([N,' Finished running']) - except Exception as e: +# procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) +# jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) +# print ([len(jobs),' Jobs are running']) +# N = len(jobs) +# while jobs : +# x = len(jobs) +# jobs = [_job for _job in jobs if _job.is_alive()] +# if x != len(jobs) : +# print ([len(jobs),'... 
jobs still running']) +# time.sleep(1) +# print ([N,' Finished running']) +# except Exception as e: - print (e) +# print (e) - \ No newline at end of file + diff --git a/setup.py b/setup.py index 7eff1e4..254bb5c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ args = { "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','sqlalchemy<2.0.0','pandas','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pymongo','sqlalchemy<2.0.0','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index bbb2e50..e139aa5 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -28,7 +28,7 @@ import importlib import sys import sqlalchemy if sys.version_info[0] > 2 : - from transport.common import Reader, Writer,Console #, factory + # from transport.common import Reader, Writer,Console #, factory from transport import disk from transport import s3 as s3 @@ -97,7 +97,7 @@ class factory : TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { "etl":{"class":{"read":etl.instance,"write":etl.instance}}, - "console":{"class":{"write":Console,"read":Console}}, + # "console":{"class":{"write":Console,"read":Console}}, "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, @@ -124,6 +124,9 @@ class factory : # # Legacy code being returned return factory._instance(**_args); + + + else: return instance(**_args) @staticmethod @@ -175,22 +178,31 @@ def instance(**_pargs): file.close() _provider = _args['provider'] - _group = None + _context = list( set(['read','write','listen']) & set(_args.keys()) ) + if _context : + _context = _context[0] + else: + _context = _args['context'] if 'context' in _args else 'read' + # _group = None - for _id in providers.CATEGORIES : - if _provider in providers.CATEGORIES[_id] : - _group = _id - break - if _group : + # for _id in providers.CATEGORIES : + # if _provider in providers.CATEGORIES[_id] : + # _group = _id + # break + # if _group : + + if _provider in providers.PROVIDERS and _context in providers.PROVIDERS[_provider]: - _classPointer = _getClassInstance(_group,**_args) + # _classPointer = _getClassInstance(_group,**_args) + _classPointer = providers.PROVIDERS[_provider][_context] # # Let us reformat the arguments - if 'read' in _args or 'write' in _args : - _args = _args['read'] if 'read' in _args else _args['write'] - _args['provider'] = _provider - if _group == 'sql' : + # if 'read' in _args or 'write' in _args : + # _args = _args['read'] if 'read' in _args else _args['write'] + # _args['provider'] = _provider + # if _group == 'sql' : + if _provider in providers.CATEGORIES['sql'] : _info = 
_get_alchemyEngine(**_args) _args = dict(_args,**_info) @@ -215,57 +227,68 @@ def _get_alchemyEngine(**_args): This function returns the SQLAlchemy engine associated with parameters, This is only applicable for SQL _items :_args arguments passed to the factory {provider and other} """ - #@TODO: Enable authentication files (private_key) - _username = _args['username'] if 'username' in _args else '' - _password = _args['password'] if 'password' in _args else '' - _account = _args['account'] if 'account' in _args else '' - _database = _args['database'] _provider = _args['provider'] - if _username != '': - _account = _username + ':'+_password+'@' - _host = _args['host'] if 'host' in _args else '' - _port = _args['port'] if 'port' in _args else '' - if _provider in providers.DEFAULT : - _default = providers.DEFAULT[_provider] - _host = _host if _host != '' else (_default['host'] if 'host' in _default else '') - _port = _port if _port != '' else (_default['port'] if 'port' in _default else '') - if _port == '': - _port = providers.DEFAULT['port'] if 'port' in providers.DEFAULT else '' - # - - if _host != '' and _port != '' : - _fhost = _host+":"+str(_port) #--formatted hostname + _pargs = {} + if _provider == providers.SQLITE3 : + _path = _args['database'] if 'database' in _args else _args['path'] + uri = ''.join([_provider,':///',_path]) + else: - _fhost = _host - # Let us update the parameters we have thus far + + #@TODO: Enable authentication files (private_key) + _username = _args['username'] if 'username' in _args else '' + _password = _args['password'] if 'password' in _args else '' + _account = _args['account'] if 'account' in _args else '' + _database = _args['database'] if 'database' in _args else _args['path'] + + if _username != '': + _account = _username + ':'+_password+'@' + _host = _args['host'] if 'host' in _args else '' + _port = _args['port'] if 'port' in _args else '' + if _provider in providers.DEFAULT : + _default = providers.DEFAULT[_provider] + _host = _host if _host != '' else (_default['host'] if 'host' in _default else '') + _port = _port if _port != '' else (_default['port'] if 'port' in _default else '') + if _port == '': + _port = providers.DEFAULT['port'] if 'port' in providers.DEFAULT else '' + # + + if _host != '' and _port != '' : + _fhost = _host+":"+str(_port) #--formatted hostname + else: + _fhost = _host + # Let us update the parameters we have thus far # - uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) + uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) + _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} _engine = sqlalchemy.create_engine (uri,future=True) _out = {'sqlalchemy':_engine} - _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} + for key in _pargs : if _pargs[key] != '' : _out[key] = _pargs[key] return _out +@DeprecationWarning def _getClassInstance(_group,**_args): """ This function returns the class instance we are attempting to instanciate :_group items in providers.CATEGORIES.keys() :_args arguments passed to the factory class """ - if 'read' in _args or 'write' in _args : - _context = 'read' if 'read' in _args else _args['write'] - _info = _args[_context] - else: - _context = _args['context'] if 'context' in _args else 'read' - _class = providers.READ[_group] if _context == 'read' else providers.WRITE[_group] - if type(_class) == dict and _args['provider'] in _class: - _class = _class[_args['provider']] + # if 'read' in _args or 'write' in _args : + # _context 
= 'read' if 'read' in _args else _args['write'] + # _info = _args[_context] + # else: + # _context = _args['context'] if 'context' in _args else 'read' + # _class = providers.READ[_group] if _context == 'read' else providers.WRITE[_group] + # if type(_class) == dict and _args['provider'] in _class: + # _class = _class[_args['provider']] - return _class + # return _class +@DeprecationWarning def __instance(**_args): """ diff --git a/transport/common.py b/transport/common.py index 39df6a3..59f57ea 100644 --- a/transport/common.py +++ b/transport/common.py @@ -93,29 +93,29 @@ class ReadWriter(Reader,Writer) : This class implements the read/write functions aggregated """ pass -class Console(Writer): - lock = RLock() - def __init__(self,**_args): - self.lock = _args['lock'] if 'lock' in _args else False - self.info = self.write - self.debug = self.write - self.log = self.write - pass - def write (self,logs=None,**_args): - if self.lock : - Console.lock.acquire() - try: - _params = _args if logs is None and _args else logs - if type(_params) == list: - for row in _params : - print (row) - else: - print (_params) - except Exception as e : - print (e) - finally: - if self.lock : - Console.lock.release() +# class Console(Writer): +# lock = RLock() +# def __init__(self,**_args): +# self.lock = _args['lock'] if 'lock' in _args else False +# self.info = self.write +# self.debug = self.write +# self.log = self.write +# pass +# def write (self,logs=None,**_args): +# if self.lock : +# Console.lock.acquire() +# try: +# _params = _args if logs is None and _args else logs +# if type(_params) == list: +# for row in _params : +# print (row) +# else: +# print (_params) +# except Exception as e : +# print (e) +# finally: +# if self.lock : +# Console.lock.release() """ diff --git a/transport/etl.py b/transport/etl.py index 9d520d4..dac58c4 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -35,6 +35,9 @@ import json import sys import transport import time +import os + + from multiprocessing import Process SYS_ARGS = {} if len(sys.argv) > 1: @@ -52,199 +55,301 @@ if len(sys.argv) > 1: i += 2 - -class Post(Process): - def __init__(self,**args): +class Transporter(Process): + """ + The transporter (Jason Stathem) moves data from one persistant store to another + - callback functions + :onFinish callback function when finished + :onError callback function when an error occurs + :source source data specification + :target destination(s) to move the data to + """ + def __init__(self,**_args): super().__init__() - self.store = args['target'] - if 'provider' not in args['target'] : - pass - self.PROVIDER = args['target']['type'] - # self.writer = transport.factory.instance(**args['target']) - else: - self.PROVIDER = args['target']['provider'] - self.store['context'] = 'write' - # self.store = args['target'] - self.store['lock'] = True - # self.writer = transport.instance(**args['target']) + # self.onfinish = _args['onFinish'] + # self._onerror = _args['onError'] + self._source = _args['source'] + self._target = _args['target'] + # - # If the table doesn't exists maybe create it ? 
+ # Let's insure we can support multiple targets + self._target = [self._target] if type(self._target) != list else self._target + + pass + def read(self,**_args): + """ + This function + """ + _reader = transport.factory.instance(**self._source) # - self.rows = args['rows'] - # self.rows = args['rows'].fillna('') + # If arguments are provided then a query is to be executed (not just a table dump) + return _reader.read() if 'args' not in self._source else _reader.read(**self._source['args']) + + def _delegate_write(self,_data,**_args): + """ + This function will write a data-frame to a designated data-store, The function is built around a delegation design pattern + :data data-frame or object to be written + """ + for _target in self._target : + if 'write' not in _target : + _target['context'] = 'write' + _target['lock'] = True + else: + _target['write']['lock'] = True + _writer = transport.factory.instance(**_target) + _writer.write(_data,**_args) + if hasattr(_writer,'close') : + _writer.close() + + def write(self,_df,**_args): + """ + """ + SEGMENT_COUNT = 6 + MAX_ROWS = 1000000 + # _df = self.read() + _segments = np.array_split(np.range(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) + # _index = 0 + + + for _indexes in _segments : + _fwd_args = {} if not _args else _args + + self._delegate_write(_df.iloc[_indexes],**_fwd_args) + # + # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) + pass + +def instance(**_args): + _proxy = lambda _agent: _agent.write(_agent.read()) + if 'source' in _args and 'target' in _args : + + _agent = Transporter(**_args) + _proxy(_agent) + + else: + _config = _args['config'] + _items = [Transporter(**_item) for _item in _config ] + _MAX_JOBS = 5 + _items = np.array_split(_items,_MAX_JOBS) + for _batch in _items : + jobs = [] + for _item in _batch : + thread = Process(target=_proxy,args = (_item,)) + thread.start() + jobs.append(thread) + while jobs : + jobs = [thread for thread in jobs if thread.is_alive()] + time.sleep(1) + + pass +# class Post(Process): +# def __init__(self,**args): +# super().__init__() +# self.store = args['target'] +# if 'provider' not in args['target'] : +# pass +# self.PROVIDER = args['target']['type'] +# # self.writer = transport.factory.instance(**args['target']) +# else: +# self.PROVIDER = args['target']['provider'] +# self.store['context'] = 'write' +# # self.store = args['target'] +# self.store['lock'] = True +# # self.writer = transport.instance(**args['target']) +# # +# # If the table doesn't exists maybe create it ? 
+# # +# self.rows = args['rows'] +# # self.rows = args['rows'].fillna('') - def log(self,**_args) : - if ETL.logger : - ETL.logger.info(**_args) +# def log(self,**_args) : +# if ETL.logger : +# ETL.logger.info(**_args) - def run(self): - _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows +# def run(self): +# _info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows - writer = transport.factory.instance(**self.store) - writer.write(_info) - writer.close() +# writer = transport.factory.instance(**self.store) +# writer.write(_info) +# writer.close() -class ETL (Process): - logger = None - def __init__(self,**_args): - super().__init__() +# class ETL (Process): +# logger = None +# def __init__(self,**_args): +# super().__init__() - self.name = _args['id'] if 'id' in _args else 'UNREGISTERED' - # if 'provider' not in _args['source'] : - # #@deprecate - # self.reader = transport.factory.instance(**_args['source']) - # else: - # # - # # This is the new interface - # _args['source']['context'] = 'read' +# self.name = _args['id'] if 'id' in _args else 'UNREGISTERED' +# # if 'provider' not in _args['source'] : +# # #@deprecate +# # self.reader = transport.factory.instance(**_args['source']) +# # else: +# # # +# # # This is the new interface +# # _args['source']['context'] = 'read' - # self.reader = transport.instance(**_args['source']) +# # self.reader = transport.instance(**_args['source']) - # - # do we have an sql query provided or not .... - # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None - # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None - # self._oargs = _args['target'] #transport.factory.instance(**_args['target']) - self._source = _args ['source'] - self._target = _args['target'] - self._source['context'] = 'read' - self._target['context'] = 'write' +# # +# # do we have an sql query provided or not .... 
+# # self.sql = _args['source']['sql'] if 'sql' in _args['source'] else None +# # self.cmd = _args['source']['cmd'] if 'cmd' in _args['source'] else None +# # self._oargs = _args['target'] #transport.factory.instance(**_args['target']) +# self._source = _args ['source'] +# self._target = _args['target'] +# self._source['context'] = 'read' +# self._target['context'] = 'write' - self.JOB_COUNT = _args['jobs'] - self.jobs = [] - # self.logger = transport.factory.instance(**_args['logger']) - def log(self,**_args) : - if ETL.logger : - ETL.logger.info(**_args) +# self.JOB_COUNT = _args['jobs'] +# self.jobs = [] +# # self.logger = transport.factory.instance(**_args['logger']) +# def log(self,**_args) : +# if ETL.logger : +# ETL.logger.info(**_args) - def run(self): - # if self.cmd : - # idf = self.reader.read(**self.cmd) - # else: - # idf = self.reader.read() - # idf = pd.DataFrame(idf) - # # idf = idf.replace({np.nan: None}, inplace = True) +# def run(self): +# # if self.cmd : +# # idf = self.reader.read(**self.cmd) +# # else: +# # idf = self.reader.read() +# # idf = pd.DataFrame(idf) +# # # idf = idf.replace({np.nan: None}, inplace = True) - # idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] - # self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) +# # idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] +# # self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) - # - # writing the data to a designated data source - # - try: +# # +# # writing the data to a designated data source +# # +# try: - _log = {"name":self.name,"rows":{"input":0,"output":0}} - _reader = transport.factory.instance(**self._source) - if 'table' in self._source : - _df = _reader.read() - else: - _df = _reader.read(**self._source['cmd']) - _log['rows']['input'] = _df.shape[0] - # - # Let's write the input data-frame to the target ... - _writer = transport.factory.instance(**self._target) - _writer.write(_df) - _log['rows']['output'] = _df.shape[0] +# _log = {"name":self.name,"rows":{"input":0,"output":0}} +# _reader = transport.factory.instance(**self._source) +# if 'table' in self._source : +# _df = _reader.read() +# else: +# _df = _reader.read(**self._source['cmd']) +# _log['rows']['input'] = _df.shape[0] +# # +# # Let's write the input data-frame to the target ... 
+# _writer = transport.factory.instance(**self._target) +# _writer.write(_df) +# _log['rows']['output'] = _df.shape[0] - # self.log(module='write',action='partitioning',jobs=self.JOB_COUNT) - # rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) +# # self.log(module='write',action='partitioning',jobs=self.JOB_COUNT) +# # rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) - # # - # # @TODO: locks - # for i in np.arange(self.JOB_COUNT) : - # # _id = ' '.join([str(i),' table ',self.name]) - # indexes = rows[i] - # segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') - # _name = "partition-"+str(i) - # if segment.shape[0] == 0 : - # continue +# # # +# # # @TODO: locks +# # for i in np.arange(self.JOB_COUNT) : +# # # _id = ' '.join([str(i),' table ',self.name]) +# # indexes = rows[i] +# # segment = idf.loc[indexes,:].copy() #.to_dict(orient='records') +# # _name = "partition-"+str(i) +# # if segment.shape[0] == 0 : +# # continue - # proc = Post(target = self._oargs,rows = segment,name=_name) - # self.jobs.append(proc) - # proc.start() +# # proc = Post(target = self._oargs,rows = segment,name=_name) +# # self.jobs.append(proc) +# # proc.start() - # self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0]) - # while self.jobs : - # jobs = [job for job in proc if job.is_alive()] - # time.sleep(1) - except Exception as e: - print (e) - self.log(**_log) - def is_done(self): - self.jobs = [proc for proc in self.jobs if proc.is_alive()] - return len(self.jobs) == 0 -def instance(**_args): - """ - :path ,index, id - :param _info list of objects with {source,target}` - :param logger - """ - logger = _args['logger'] if 'logger' in _args else None - if 'path' in _args : - _info = json.loads((open(_args['path'])).read()) +# # self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0]) +# # while self.jobs : +# # jobs = [job for job in proc if job.is_alive()] +# # time.sleep(1) +# except Exception as e: +# print (e) +# self.log(**_log) +# def is_done(self): +# self.jobs = [proc for proc in self.jobs if proc.is_alive()] +# return len(self.jobs) == 0 + + +# def instance (**_args): +# """ +# path to configuration file +# """ +# _path = _args['path'] +# _config = {} +# jobs = [] +# if os.path.exists(_path) : +# file = open(_path) +# _config = json.loads(file.read()) +# file.close() +# if _config and type + + +# def _instance(**_args): +# """ +# :path ,index, id +# :param _info list of objects with {source,target}` +# :param logger +# """ +# logger = _args['logger'] if 'logger' in _args else None +# if 'path' in _args : +# _info = json.loads((open(_args['path'])).read()) - if 'index' in _args : - _index = int(_args['index']) - _info = _info[_index] +# if 'index' in _args : +# _index = int(_args['index']) +# _info = _info[_index] - elif 'id' in _args : - _info = [_item for _item in _info if '_id' in _item and _item['id'] == _args['id']] - _info = _info[0] if _info else _info - else: - _info = _args['info'] +# elif 'id' in _args : +# _info = [_item for _item in _info if '_id' in _item and _item['id'] == _args['id']] +# _info = _info[0] if _info else _info +# else: +# _info = _args['info'] - if logger and type(logger) != str: - ETL.logger = logger - elif logger == 'console': - ETL.logger = transport.factory.instance(provider='console',context='write',lock=True) - if type(_info) in [list,dict] : - _info = _info if type(_info) != dict else [_info] - # - # The assumption here is that the objects 
within the list are {source,target} - jobs = [] - for _item in _info : +# if logger and type(logger) != str: +# ETL.logger = logger +# elif logger == 'console': +# ETL.logger = transport.factory.instance(provider='console',context='write',lock=True) +# if type(_info) in [list,dict] : +# _info = _info if type(_info) != dict else [_info] +# # +# # The assumption here is that the objects within the list are {source,target} +# jobs = [] +# for _item in _info : - _item['jobs'] = 5 if 'procs' not in _args else int(_args['procs']) - _job = ETL(**_item) +# _item['jobs'] = 5 if 'procs' not in _args else int(_args['procs']) +# _job = ETL(**_item) - _job.start() - jobs.append(_job) - return jobs +# _job.start() +# jobs.append(_job) +# return jobs - else: - return None - -if __name__ == '__main__' : - _info = json.loads(open (SYS_ARGS['config']).read()) - index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None - procs = [] - for _config in _info : - if 'source' in SYS_ARGS : - _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} - - _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) - etl = ETL (**_config) - if index is None: +# else: +# return None + +# if __name__ == '__main__' : +# _info = json.loads(open (SYS_ARGS['config']).read()) +# index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else None +# procs = [] +# for _config in _info : +# if 'source' in SYS_ARGS : +# _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} + +# _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) +# etl = ETL (**_config) +# if index is None: - etl.start() - procs.append(etl) +# etl.start() +# procs.append(etl) - elif _info.index(_config) == index : +# elif _info.index(_config) == index : - # print (_config) - procs = [etl] - etl.start() - break - # - # - N = len(procs) - while procs : - procs = [thread for thread in procs if not thread.is_done()] - if len(procs) < N : - print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) - N = len(procs) - time.sleep(1) - # print ("We're done !!") \ No newline at end of file +# # print (_config) +# procs = [etl] +# etl.start() +# break +# # +# # +# N = len(procs) +# while procs : +# procs = [thread for thread in procs if not thread.is_done()] +# if len(procs) < N : +# print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) +# N = len(procs) +# time.sleep(1) +# # print ("We're done !!") \ No newline at end of file diff --git a/transport/providers.py b/transport/providers.py index c1c4bae..fc394f3 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -1,4 +1,4 @@ -from transport.common import Reader, Writer,Console #, factory +# from transport.common import Reader, Writer,Console #, factory from transport import disk import sqlite3 from transport import s3 as s3 @@ -9,6 +9,7 @@ from transport import sql as sql from transport import etl as etl from transport import qlistener from transport import bricks +from transport import session import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq @@ -33,6 +34,8 @@ MARIADB = 'mariadb' COUCHDB = 'couch' CONSOLE = 'console' ETL = 'etl' + + # # synonyms of the above BQ = BIGQUERY @@ -54,13 +57,37 @@ CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,C READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader}, 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader}, 
'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, - 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} + # 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console},'http':session.HttpReader } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter}, 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter}, - 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener},'cli':{CONSOLE:Console},'memory':{CONSOLE:Console} + 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener}, + # 'cli':{CONSOLE:Console}, + # 'memory':{CONSOLE:Console}, 'http':session.HttpReader } +# SQL_PROVIDERS = [POSTGRESQL,MYSQL,NETEZZA,MARIADB,SQLITE] +PROVIDERS = { + FILE:{'read':disk.DiskReader,'write':disk.DiskWriter}, + SQLITE:{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, + + POSTGRESQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, + NETEZZA:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':nz,'default':{'port':5480}}, + REDSHIFT:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, + RABBITMQ:{'read':queue.QueueReader,'writer':queue.QueueWriter,'context':queue.QueueListener,'default':{'host':'localhost','port':5432}}, + + MYSQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, + MARIADB:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, + S3:{'read':s3.s3Reader,'write':s3.s3Writer}, + BIGQUERY:{'read':sql.BigQueryReader,'write':sql.BigQueryWriter}, + QLISTENER:{'read':qlistener.qListener,'write':qlistener.qListener,'default':{'host':'localhost','port':5672}}, + CONSOLE:{'read':qlistener.Console,"write":qlistener.Console}, + HTTP:{'read':session.HttpReader,'write':session.HttpWriter}, + DATABRICKS:{'read':bricks.BricksReader,'write':bricks.BricksWriter}, + MONGODB:{'read':mongo.MongoReader,'write':mongo.MongoWriter,'default':{'port':27017,'host':'localhost'}}, + COUCHDB:{'read':couch.CouchReader,'writer':couch.CouchWriter,'default':{'host':'localhost','port':5984}}, + ETL :{'read':etl.Transporter,'write':etl.Transporter} +} DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} DEFAULT[REDSHIFT] = DEFAULT[PG] diff --git a/transport/qlistener.py b/transport/qlistener.py index 495b731..26f0ba8 100644 --- a/transport/qlistener.py +++ b/transport/qlistener.py @@ -40,3 +40,8 @@ class qListener : _q = qListener._queue[_id] _q.put(_data) _q.join() +class Console (qListener): + def __init__(self,**_args): + super().__init__(callback=print) + + # self.callback = print \ No newline at end of file diff --git a/transport/session.py b/transport/session.py index 915d2b5..d74669a 100644 --- a/transport/session.py +++ b/transport/session.py @@ -1,54 +1,60 @@ from flask import request, session from datetime import datetime import re -from common import Reader, Writer +from transport.common import Reader, Writer import json +import requests +from io import StringIO +import pandas as pd -class HttpRequestReader(Reader): + +class HttpReader(Reader): """ This class is designed to read data from an Http request file handler provided to us by flask The file will be heald in memory and processed accordingly NOTE: This is inefficient and can crash a micro-instance (becareful) """ - def 
__init__(self,**params): - self.file_length = 0 - try: - - #self.file = params['file'] - #self.file.seek(0, os.SEEK_END) - #self.file_length = self.file.tell() - - #print 'size of file ',self.file_length - self.content = params['file'].readlines() - self.file_length = len(self.content) - except Exception as e: - print ("Error ... ",e) - pass + def __init__(self,**_args): + self._url = _args['url'] + self._headers = None if 'headers' not in _args else _args['headers'] + + # def isready(self): + # return self.file_length > 0 + def format(self,_response): + _mimetype= _response.headers['Content-Type'] + if _mimetype == 'text/csv' or 'text/csv': + _content = _response.text + return pd.read_csv(StringIO(_content)) + # + # @TODO: Add support for excel, JSON and other file formats that fit into a data-frame + # - def isready(self): - return self.file_length > 0 - def read(self,size =-1): - i = 1 - for row in self.content: - i += 1 - if size == i: - break - yield row + return _response.text + def read(self,**_args): + if self._headers : + r = requests.get(self._url,headers = self._headers) + else: + r = requests.get(self._url,headers = self._headers) + return self.format(r) -class HttpSessionWriter(Writer): +class HttpWriter(Writer): """ - This class is designed to write data to a session/cookie + This class is designed to submit data to an endpoint (url) """ - def __init__(self,**params): + def __init__(self,**_args): """ @param key required session key """ - self.session = params['queue'] - self.session['sql'] = [] - self.session['csv'] = [] - self.tablename = re.sub('..+$','',params['filename']) - self.session['uid'] = params['uid'] + self._url = _args['url'] + self._name = _args['name'] + self._method = 'post' if 'method' not in _args else _args['method'] + + # self.session = params['queue'] + # self.session['sql'] = [] + # self.session['csv'] = [] + # self.tablename = re.sub('..+$','',params['filename']) + # self.session['uid'] = params['uid'] #self.xchar = params['xchar'] @@ -57,10 +63,26 @@ class HttpSessionWriter(Writer): return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename) def isready(self): return True - def write(self,**params): - label = params['label'] - row = params ['row'] + def write(self,_data,**_args): + # + # + _method = self._method if 'method' not in _args else _args['method'] + _method = _method.lower() + _mimetype = 'text/csv' + if type(_data) == dict : + _mimetype = 'application/json' + _content = _data + else: + _content = _data.to_dict(orient='records') + _headers = {'Content-Type':_mimetype} + _pointer = getattr(requests,_method) + + _pointer ({self._name:_content},headers=_headers) + + + # label = params['label'] + # row = params ['row'] - if label == 'usable': - self.session['csv'].append(self.format(row,',')) - self.session['sql'].append(self.format_sql(row)) + # if label == 'usable': + # self.session['csv'].append(self.format(row,',')) + # self.session['sql'].append(self.format_sql(row)) diff --git a/transport/sql.py b/transport/sql.py index 3c555f5..019db78 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -291,17 +291,17 @@ class SQLWriter(SQLRW,Writer): """ # inspect = False if 'inspect' not in _args else _args['inspect'] # cast = False if 'cast' not in _args else _args['cast'] - if not self.fields : - if type(info) == list : - _fields = info[0].keys() - elif type(info) == dict : - _fields = info.keys() - elif type(info) == pd.DataFrame : - _fields = info.columns.tolist() - - # _fields = info.keys() if type(info) == 
dict else info[0].keys() - _fields = list (_fields) - self.init(_fields) + # if not self.fields : + # if type(info) == list : + # _fields = info[0].keys() + # elif type(info) == dict : + # _fields = info.keys() + # elif type(info) == pd.DataFrame : + # _fields = info.columns.tolist() + + # # _fields = info.keys() if type(info) == dict else info[0].keys() + # # _fields = list (_fields) + # self.init(_fields) try: table = _args['table'] if 'table' in _args else self.table diff --git a/transport/version.py b/transport/version.py index ec087c4..5e7e7b7 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.8.6' +__version__= '1.9.0' From 2bb07aedec694dc0074bc6f66b0133d29b8d4d0f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 30 Sep 2023 00:18:37 -0500 Subject: [PATCH 161/271] bug fix: sqlite and cursors and transport --- bin/transport | 28 ++++++++++----- transport/disk.py | 87 ++++++++++++++++++++++++----------------------- transport/etl.py | 28 ++++++++------- 3 files changed, 80 insertions(+), 63 deletions(-) diff --git a/bin/transport b/bin/transport index 2225f3b..dd424a2 100755 --- a/bin/transport +++ b/bin/transport @@ -46,6 +46,7 @@ import time from multiprocessing import Process import typer import os +import transport from transport import etl from transport import providers @@ -88,7 +89,7 @@ def move (path,index=None): _config = _config[ int(index)] etl.instance(**_config) else: - etl.instance(_config) + etl.instance(config=_config) # # if type(_config) == dict : @@ -109,19 +110,30 @@ def move (path,index=None): # jobs.append(thread()) # if _config.index(_args) == 0 : # thread.join() - wait(jobs) - + # wait(jobs) +@app.command() +def version(): + print (transport.version.__version__) @app.command() def generate (path:str): - __doc__=""" - """ - _config = [{"source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"},"target":{"provider":"file","path":"addresses.csv","delimiter":"csv"}}] + This function will generate a configuration template to give a sense of how to create one + """ + _config = [ + { + "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, + "target": + [{"provider":"file","path":"addresses.csv","delimiter":"csv"},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] + } + ] file = open(path,'w') file.write(json.dumps(_config)) file.close() - -# if __name__ == '__main__' : +@app.command() +def usage(): + print (__doc__) +if __name__ == '__main__' : + app() # # # # Load information from the file ... 
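# A typical session with the reworked CLI, as a hedged sketch (the file paths are
# illustrative, and --index assumes typer exposes the optional `index` parameter of
# `move` as an option):
#
#   transport generate /tmp/etl-config.json    # write the sample source/target template
#   transport move /tmp/etl-config.json        # run every job described in the file
#   transport move /tmp/etl-config.json --index 0
#   transport version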
# if 'help' in SYS_ARGS : diff --git a/transport/disk.py b/transport/disk.py index 8514e3f..a3880ec 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -62,34 +62,25 @@ class DiskWriter(Writer): """ THREAD_LOCK = Lock() def __init__(self,**params): - Writer.__init__(self) - self.cache['meta'] = {'cols':0,'rows':0,'delimiter':None} - if 'path' in params: - self.path = params['path'] - else: - self.path = 'data-transport.log' - self.delimiter = params['delimiter'] if 'delimiter' in params else None - # if 'name' in params: - # self.name = params['name']; - # else: - # self.name = 'data-transport.log' - # if os.path.exists(self.path) == False: - # os.mkdir(self.path) - def meta(self): - return self.cache['meta'] - def isready(self): - """ - This function determines if the class is ready for execution or not - i.e it determines if the preconditions of met prior execution - """ - return True - # p = self.path is not None and os.path.exists(self.path) - # q = self.name is not None - # return p and q - def format (self,row): - self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) - self.cache['meta']['rows'] += 1 - return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" + super().__init__() + self._path = params['path'] + self._delimiter = params['delimiter'] + + # def meta(self): + # return self.cache['meta'] + # def isready(self): + # """ + # This function determines if the class is ready for execution or not + # i.e it determines if the preconditions of met prior execution + # """ + # return True + # # p = self.path is not None and os.path.exists(self.path) + # # q = self.name is not None + # # return p and q + # def format (self,row): + # self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) + # self.cache['meta']['rows'] += 1 + # return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" def write(self,info,**_args): """ This function writes a record to a designated file @@ -97,21 +88,30 @@ class DiskWriter(Writer): @param row row to be written """ try: + _mode = 'a' if 'overwrite' not in _args else 'w' DiskWriter.THREAD_LOCK.acquire() - f = open(self.path,_mode) - if self.delimiter : - if type(info) == list : - for row in info : - f.write(self.format(row)) - else: - f.write(self.format(info)) - else: - if not type(info) == str : - f.write(json.dumps(info)+"\n") - else: - f.write(info) - f.close() + # # _path = _args['path'] if 'path' in _args else self.path + # # _delim= _args['delimiter'] if 'delimiter' in _args else self._delimiter + # # info.to_csv(_path,sep=_delim) + # info.to_csv(self.path) + # f = open(self.path,_mode) + # if self.delimiter : + # if type(info) == list : + # for row in info : + # f.write(self.format(row)) + # else: + # f.write(self.format(info)) + # else: + # if not type(info) == str : + # f.write(json.dumps(info)+"\n") + # else: + # f.write(info) + # f.close() + _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] + _path = self.path if 'path' not in _args else _args['path'] + info.to_csv(_path,index=False,sep=_delim) + pass except Exception as e: # # Not sure what should be done here ... 
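With this change DiskWriter.write simply hands the incoming object to pandas' to_csv, so callers are expected to pass a DataFrame rather than individual rows. A minimal sketch, assuming the factory forwards the same provider/context/path keywords used in the ETL target dictionaries (the file name and data below are placeholders):

    import transport
    import pandas as pd

    _df = pd.DataFrame([{"id":1,"name":"alice"},{"id":2,"name":"bob"}])
    _writer = transport.factory.instance(provider='file',context='write',
                                         path='sample.csv',delimiter=',')
    _writer.write(_df)   # serializes the frame via _df.to_csv(path,index=False,sep=',')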
@@ -220,16 +220,19 @@ class SQLiteWriter(SQLite,DiskWriter) : # # If the table doesn't exist we should create it # - def write(self,info): + def write(self,info,**_args): """ """ if not self.fields : + if type(info) == pd.DataFrame : + _columns = list(info.columns) self.init(list(info.keys())) if type(info) == dict : info = [info] elif type(info) == pd.DataFrame : + info = info.fillna('') info = info.to_dict(orient='records') SQLiteWriter.LOCK.acquire() diff --git a/transport/etl.py b/transport/etl.py index dac58c4..aa4a73e 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -70,7 +70,7 @@ class Transporter(Process): # self._onerror = _args['onError'] self._source = _args['source'] self._target = _args['target'] - + # # Let's insure we can support multiple targets self._target = [self._target] if type(self._target) != list else self._target @@ -90,16 +90,18 @@ class Transporter(Process): This function will write a data-frame to a designated data-store, The function is built around a delegation design pattern :data data-frame or object to be written """ - for _target in self._target : - if 'write' not in _target : - _target['context'] = 'write' - _target['lock'] = True - else: - _target['write']['lock'] = True - _writer = transport.factory.instance(**_target) - _writer.write(_data,**_args) - if hasattr(_writer,'close') : - _writer.close() + if _data.shape[0] > 0 : + for _target in self._target : + if 'write' not in _target : + _target['context'] = 'write' + # _target['lock'] = True + else: + # _target['write']['lock'] = True + pass + _writer = transport.factory.instance(**_target) + _writer.write(_data.copy(),**_args) + if hasattr(_writer,'close') : + _writer.close() def write(self,_df,**_args): """ @@ -109,12 +111,12 @@ class Transporter(Process): # _df = self.read() _segments = np.array_split(np.range(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) # _index = 0 - + for _indexes in _segments : _fwd_args = {} if not _args else _args - self._delegate_write(_df.iloc[_indexes],**_fwd_args) + self._delegate_write(_df.iloc[_indexes]) # # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) 
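        # Intent of the segmentation above: frames above MAX_ROWS (1,000,000 rows) are
        # split into SEGMENT_COUNT (6) roughly equal index blocks by np.array_split and
        # written one block at a time; smaller frames are written in a single pass.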
pass From 4320159f3d930f74ff84c537d7c62992f6d2ebb1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 30 Sep 2023 01:17:35 -0500 Subject: [PATCH 162/271] bug fixes --- transport/disk.py | 29 +++++++---------------------- transport/etl.py | 2 +- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index a3880ec..1d966c7 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -65,7 +65,7 @@ class DiskWriter(Writer): super().__init__() self._path = params['path'] self._delimiter = params['delimiter'] - + self._mode = 'w' if 'mode' not in params else params['mode'] # def meta(self): # return self.cache['meta'] # def isready(self): @@ -89,28 +89,13 @@ class DiskWriter(Writer): """ try: - _mode = 'a' if 'overwrite' not in _args else 'w' - DiskWriter.THREAD_LOCK.acquire() - # # _path = _args['path'] if 'path' in _args else self.path - # # _delim= _args['delimiter'] if 'delimiter' in _args else self._delimiter - # # info.to_csv(_path,sep=_delim) - # info.to_csv(self.path) - # f = open(self.path,_mode) - # if self.delimiter : - # if type(info) == list : - # for row in info : - # f.write(self.format(row)) - # else: - # f.write(self.format(info)) - # else: - # if not type(info) == str : - # f.write(json.dumps(info)+"\n") - # else: - # f.write(info) - # f.close() + + DiskWriter.THREAD_LOCK.acquire() + _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] - _path = self.path if 'path' not in _args else _args['path'] - info.to_csv(_path,index=False,sep=_delim) + _path = self._path if 'path' not in _args else _args['path'] + _mode = self._mode if 'mode' not in _args else _args['mode'] + info.to_csv(_path,index=False,sep=_delim, mode=_mode) pass except Exception as e: # diff --git a/transport/etl.py b/transport/etl.py index aa4a73e..b2e0e6a 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -116,7 +116,7 @@ class Transporter(Process): for _indexes in _segments : _fwd_args = {} if not _args else _args - self._delegate_write(_df.iloc[_indexes]) + self._delegate_write(_df.iloc[_indexes],**_fwd_args) # # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) 
pass From a7fe357b2c4687cc736bcaaf56a7de71de6fa162 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 30 Sep 2023 09:24:58 -0500 Subject: [PATCH 163/271] bug fixes, write file and other misc with ETL --- transport/disk.py | 2 +- transport/etl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 1d966c7..d8ee757 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -95,7 +95,7 @@ class DiskWriter(Writer): _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] _path = self._path if 'path' not in _args else _args['path'] _mode = self._mode if 'mode' not in _args else _args['mode'] - info.to_csv(_path,index=False,sep=_delim, mode=_mode) + info.to_csv(_path,index=False,sep=_delim) pass except Exception as e: # diff --git a/transport/etl.py b/transport/etl.py index b2e0e6a..198bdf0 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -99,7 +99,7 @@ class Transporter(Process): # _target['write']['lock'] = True pass _writer = transport.factory.instance(**_target) - _writer.write(_data.copy(),**_args) + _writer.write(_data,**_args) if hasattr(_writer,'close') : _writer.close() From 5660d8ba593f34a677759b575b58436dfef8a53f Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 11 Nov 2023 10:30:58 -0600 Subject: [PATCH 164/271] nextcloud handling --- transport/disk.py | 2 +- transport/nextcloud.py | 76 ++++++++++++++++++++++++++++++++++++++++++ transport/providers.py | 18 ++++++---- transport/version.py | 2 +- 4 files changed, 90 insertions(+), 8 deletions(-) create mode 100644 transport/nextcloud.py diff --git a/transport/disk.py b/transport/disk.py index d8ee757..f092a3d 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -64,7 +64,7 @@ class DiskWriter(Writer): def __init__(self,**params): super().__init__() self._path = params['path'] - self._delimiter = params['delimiter'] + self._delimiter = params['delimiter'] if 'delimiter' in params else None self._mode = 'w' if 'mode' not in params else params['mode'] # def meta(self): # return self.cache['meta'] diff --git a/transport/nextcloud.py b/transport/nextcloud.py new file mode 100644 index 0000000..457eb83 --- /dev/null +++ b/transport/nextcloud.py @@ -0,0 +1,76 @@ +""" +We are implementing transport to and from nextcloud (just like s3) +""" +import os +import sys +from transport.common import Reader,Writer +import pandas as pd +from io import StringIO +import json +import nextcloud_client as nextcloud + +class Nextcloud : + def __init__(self,**_args): + pass + self._delimiter = None + self._handler = nextcloud.Client(_args['url']) + _uid = _args['uid'] + _token = _args['token'] + self._uri = _args['folder'] if 'folder' in _args else './' + if self._uri.endswith('/') : + self._uri = self._uri[:-1] + self._file = None if 'file' not in _args else _args['file'] + self._handler.login(_uid,_token) + def close(self): + try: + self._handler.logout() + except Exception as e: + pass + + +class NextcloudReader(Nextcloud,Reader): + def __init__(self,**_args): + # self._file = [] if 'file' not in _args else _args['file'] + super().__init__(**_args) + pass + def read(self,**_args): + _filename = self._file if 'file' not in _args else _args['file'] + # + # @TODO: if _filename is none, an exception should be raised + # + _uri = '/'.join([self._uri,_filename]) + if self._handler.get_file(_uri) : + # + # + _info = self._handler.file_info(_uri) + _content = self._handler.get_file_contents(_uri).decode('utf8') + if _info.get_content_type() == 'text/csv' : + _file = 
StringIO(_content) + return pd.read_csv(_file) + else: + return _content + return None +class NextcloudWriter (Nextcloud,Writer): + """ + This class will write data to an instance of nextcloud + """ + def __init__(self,**_args) : + super().__init__(**_args) + self + def write(self,_data,**_args): + """ + This function will upload a file to a given destination + :file has the uri of the location of the file + """ + _filename = self._file if 'file' not in _args else _args['file'] + _uri = '/'.join([self._uri,_filename]) + if type(_data) == pd.DataFrame : + f = StringIO() + _data.to_csv(f,index=False) + _content = f.getvalue() + elif type(_data) == dict : + _content = json.dumps(_data) + else: + _content = str(_data) + self._handler.put_file_contents(_uri,_content) + diff --git a/transport/providers.py b/transport/providers.py index fc394f3..a798960 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -10,6 +10,7 @@ from transport import etl as etl from transport import qlistener from transport import bricks from transport import session +from transport import nextcloud import psycopg2 as pg import mysql.connector as my from google.cloud import bigquery as bq @@ -34,7 +35,7 @@ MARIADB = 'mariadb' COUCHDB = 'couch' CONSOLE = 'console' ETL = 'etl' - +NEXTCLOUD = 'nextcloud' # # synonyms of the above @@ -49,18 +50,19 @@ AWS_S3 = 's3' RABBIT = RABBITMQ QLISTENER = 'qlistener' +QUEUE = QLISTENER DATABRICKS= 'databricks+connector' DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} -CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[BIGQUERY,DATABRICKS],'file':[FILE], - 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QLISTENER],'http':[HTTP]} +CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[NEXTCLOUD,S3,BIGQUERY,DATABRICKS],'file':[FILE], + 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QUEUE],'http':[HTTP]} READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader}, - 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader}, + 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader,NEXTCLOUD:nextcloud.NextcloudReader}, 'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, # 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console},'http':session.HttpReader } WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter}, - 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter}, + 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter,NEXTCLOUD:nextcloud.NextcloudWriter}, 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener}, # 'cli':{CONSOLE:Console}, # 'memory':{CONSOLE:Console}, 'http':session.HttpReader @@ -78,12 +80,16 @@ PROVIDERS = { MYSQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, MARIADB:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, + S3:{'read':s3.s3Reader,'write':s3.s3Writer}, BIGQUERY:{'read':sql.BigQueryReader,'write':sql.BigQueryWriter}, + DATABRICKS:{'read':bricks.BricksReader,'write':bricks.BricksWriter}, + NEXTCLOUD:{'read':nextcloud.NextcloudReader,'write':nextcloud.NextcloudWriter}, + QLISTENER:{'read':qlistener.qListener,'write':qlistener.qListener,'default':{'host':'localhost','port':5672}}, CONSOLE:{'read':qlistener.Console,"write":qlistener.Console}, 
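    # A hedged usage sketch for the new nextcloud entry (the URL, credentials and
    # file names below are placeholders, not values from this repository):
    #
    #   import transport
    #   from transport import providers
    #   _reader = transport.factory.instance(provider=providers.NEXTCLOUD,context='read',
    #                   url='https://cloud.example.com',uid='demo-user',token='app-password',
    #                   folder='datasets',file='sample.csv')
    #   _df = _reader.read()   # a DataFrame when the remote file is text/csv, raw text otherwise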
HTTP:{'read':session.HttpReader,'write':session.HttpWriter}, - DATABRICKS:{'read':bricks.BricksReader,'write':bricks.BricksWriter}, + MONGODB:{'read':mongo.MongoReader,'write':mongo.MongoWriter,'default':{'port':27017,'host':'localhost'}}, COUCHDB:{'read':couch.CouchReader,'writer':couch.CouchWriter,'default':{'host':'localhost','port':5984}}, ETL :{'read':etl.Transporter,'write':etl.Transporter} diff --git a/transport/version.py b/transport/version.py index 5e7e7b7..3fa6e8d 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.0' +__version__= '1.9.2' From 0930eb0f5c2378a4b9f09616f46e8072e364c02c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 11 Nov 2023 10:37:32 -0600 Subject: [PATCH 165/271] nextcloud dependency pyncclient --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 254bb5c..c322c38 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ args = { "license":"MIT", "packages":["transport"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pymongo','sqlalchemy<2.0.0','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pyncclient','pymongo','sqlalchemy<2.0.0','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : From 7d29a69a232dec91357c69c020dfa70ec79be037 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 21 Nov 2023 11:01:05 -0600 Subject: [PATCH 166/271] reminder --- transport/nextcloud.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/transport/nextcloud.py b/transport/nextcloud.py index 457eb83..f096f70 100644 --- a/transport/nextcloud.py +++ b/transport/nextcloud.py @@ -45,9 +45,13 @@ class NextcloudReader(Nextcloud,Reader): _info = self._handler.file_info(_uri) _content = self._handler.get_file_contents(_uri).decode('utf8') if _info.get_content_type() == 'text/csv' : + # + # @TODO: enable handling of csv, xls, parquet, pickles _file = StringIO(_content) return pd.read_csv(_file) else: + # + # if it is neither a structured document like csv, we will return the content as is return _content return None class NextcloudWriter (Nextcloud,Writer): From 9da2894b07fb47b583382173433e8ac579de9855 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 30 Nov 2023 12:05:04 -0600 Subject: [PATCH 167/271] bug fixes with sqlite and provider --- transport/disk.py | 15 +++++++++------ transport/providers.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index f092a3d..956386d 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -209,17 +209,20 @@ class SQLiteWriter(SQLite,DiskWriter) : """ """ - if not self.fields : - if type(info) == pd.DataFrame : - _columns = list(info.columns) - self.init(list(info.keys())) + #if not self.fields : + # #if type(info) == pd.DataFrame : + # # _columns = list(info.columns) + # #self.init(list(info.keys())) if type(info) == dict : info = [info] elif type(info) == pd.DataFrame 
: info = info.fillna('') info = info.to_dict(orient='records') - + if not self.fields : + _rec = info[0] + self.init(list(_rec.keys())) + SQLiteWriter.LOCK.acquire() try: @@ -238,4 +241,4 @@ class SQLiteWriter(SQLite,DiskWriter) : except Exception as e : print (e) pass - SQLiteWriter.LOCK.release() \ No newline at end of file + SQLiteWriter.LOCK.release() diff --git a/transport/providers.py b/transport/providers.py index a798960..23843e7 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -72,6 +72,7 @@ WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.Co PROVIDERS = { FILE:{'read':disk.DiskReader,'write':disk.DiskWriter}, SQLITE:{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, + 'sqlite3':{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, POSTGRESQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, NETEZZA:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':nz,'default':{'port':5480}}, @@ -98,4 +99,4 @@ DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port': DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} DEFAULT[REDSHIFT] = DEFAULT[PG] DEFAULT[MARIADB] = DEFAULT[MYSQL] -DEFAULT[NETEZZA] = {'port':5480} \ No newline at end of file +DEFAULT[NETEZZA] = {'port':5480} From 214f276ae430f02679a458c4fb563e6d0b0a55f4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 30 Nov 2023 12:05:44 -0600 Subject: [PATCH 168/271] bug fixes with sqlite and provider --- transport/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/version.py b/transport/version.py index 3fa6e8d..2b34f5b 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.2' +__version__= '1.9.4' From 5a5922b736f15f6a1228d037ca2a199873365768 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 30 Nov 2023 12:12:14 -0600 Subject: [PATCH 169/271] version update --- transport/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/version.py b/transport/version.py index 2b34f5b..3fa6e8d 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.4' +__version__= '1.9.2' From 56bdda17b70b23c0b0af4b413432bc65b43a3654 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 30 Nov 2023 12:40:57 -0600 Subject: [PATCH 170/271] minor bug fix --- transport/disk.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 956386d..42b5b33 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -219,9 +219,10 @@ class SQLiteWriter(SQLite,DiskWriter) : elif type(info) == pd.DataFrame : info = info.fillna('') info = info.to_dict(orient='records') - if not self.fields : - _rec = info[0] - self.init(list(_rec.keys())) + + if not self.fields : + _rec = info[0] + self.init(list(_rec.keys())) SQLiteWriter.LOCK.acquire() try: From d74372f645630fb100a4cb7d2afa2c421b426df4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 8 Dec 2023 18:19:46 -0600 Subject: [PATCH 171/271] bug fixes: mongodb, common, nextcloud --- transport/__init__.py | 24 ++++++++++++++---------- transport/common.py | 15 ++++++++++++++- transport/disk.py | 7 ++++++- transport/mongo.py | 6 +++--- transport/nextcloud.py | 4 ++-- 5 files changed, 39 insertions(+), 17 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 
e139aa5..234c418 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -27,6 +27,7 @@ import json import importlib import sys import sqlalchemy +from datetime import datetime if sys.version_info[0] > 2 : # from transport.common import Reader, Writer,Console #, factory from transport import disk @@ -83,16 +84,19 @@ import os # PGSQL = POSTGRESQL # import providers -class IEncoder (json.JSONEncoder): - def default (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - else: - return super(IEncoder,self).default(object) +# class IEncoder (json.JSONEncoder): +def IEncoder (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return o.isoformat() + else: + return super(IEncoder,self).default(object) + class factory : TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} PROVIDERS = { diff --git a/transport/common.py b/transport/common.py index 59f57ea..8b9f718 100644 --- a/transport/common.py +++ b/transport/common.py @@ -25,7 +25,7 @@ from multiprocessing import RLock import queue # import couch # import mongo - +from datetime import datetime class IO: def init(self,**args): @@ -39,6 +39,19 @@ class IO: continue value = args[field] setattr(self,field,value) +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return object.isoformat() + else: + return super(IEncoder,self).default(object) + class Reader (IO): """ This class is an abstraction of a read functionalities of a data store diff --git a/transport/disk.py b/transport/disk.py index 42b5b33..2c9f6c8 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -12,6 +12,8 @@ import json import sqlite3 import pandas as pd from multiprocessing import Lock +from transport.common import Reader, Writer, IEncoder + class DiskReader(Reader) : """ This class is designed to read data from disk (location on hard drive) @@ -221,6 +223,8 @@ class SQLiteWriter(SQLite,DiskWriter) : info = info.to_dict(orient='records') if not self.fields : + + _rec = info[0] self.init(list(_rec.keys())) @@ -231,7 +235,8 @@ class SQLiteWriter(SQLite,DiskWriter) : sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) for row in info : stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] - stream = json.dumps(stream).replace("[","").replace("]","") + stream = json.dumps(stream,cls=IEncoder) + stream = stream.replace("[","").replace("]","") self.conn.execute(sql.replace(":values",stream) ) diff --git a/transport/mongo.py b/transport/mongo.py index c24b4b8..bac1780 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -15,7 +15,7 @@ import gridfs # from transport import Reader,Writer import sys if sys.version_info[0] > 2 : - from transport.common import Reader, Writer + from transport.common import Reader, Writer, IEncoder else: from common import Reader, Writer import json @@ -102,7 +102,7 @@ class MongoReader(Mongo,Reader): if 'pipeline' in args : cmd['pipeline']= args['pipeline'] if 'aggregate' not in cmd : - 
cmd['aggregate'] = self.collection + cmd['aggregate'] = self.uid if 'pipeline' not in args or 'aggregate' not in cmd : cmd = args['mongo'] if 'mongo' in args else args['cmd'] if "aggregate" in cmd : @@ -182,7 +182,7 @@ class MongoWriter(Mongo,Writer): for row in rows : if type(row['_id']) == ObjectId : row['_id'] = str(row['_id']) - stream = Binary(json.dumps(collection).encode()) + stream = Binary(json.dumps(collection,cls=IEncoder).encode()) collection.delete_many({}) now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)]) name = ".".join([self.uid,'archive',now])+".json" diff --git a/transport/nextcloud.py b/transport/nextcloud.py index f096f70..2eefd51 100644 --- a/transport/nextcloud.py +++ b/transport/nextcloud.py @@ -3,7 +3,7 @@ We are implementing transport to and from nextcloud (just like s3) """ import os import sys -from transport.common import Reader,Writer +from transport.common import Reader,Writer, IEncoder import pandas as pd from io import StringIO import json @@ -73,7 +73,7 @@ class NextcloudWriter (Nextcloud,Writer): _data.to_csv(f,index=False) _content = f.getvalue() elif type(_data) == dict : - _content = json.dumps(_data) + _content = json.dumps(_data,cls=IEncoder) else: _content = str(_data) self._handler.put_file_contents(_uri,_content) From e46ebadcc2c0a383d838c83e6ddba7a82dae8162 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 11 Dec 2023 22:10:53 -0600 Subject: [PATCH 172/271] bug fix --- transport/disk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 2c9f6c8..424e95e 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -223,8 +223,6 @@ class SQLiteWriter(SQLite,DiskWriter) : info = info.to_dict(orient='records') if not self.fields : - - _rec = info[0] self.init(list(_rec.keys())) From fbfaaebbdc4300c137a0209d1a83f9ebecc6ac21 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 20 Dec 2023 10:14:15 -0600 Subject: [PATCH 173/271] adding alias CALLBACK == QLISTENER --- transport/providers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/providers.py b/transport/providers.py index 23843e7..93b6f53 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -51,6 +51,7 @@ RABBIT = RABBITMQ QLISTENER = 'qlistener' QUEUE = QLISTENER +CALLBACK = QLISTENER DATABRICKS= 'databricks+connector' DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[NEXTCLOUD,S3,BIGQUERY,DATABRICKS],'file':[FILE], From 81bc5a3ba155067806cedd4ac1128ff3d1aaca27 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 22 Dec 2023 14:02:32 -0600 Subject: [PATCH 174/271] bug fix: bigquery chunk/batch sizes --- transport/sql.py | 7 ++++--- transport/version.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 019db78..7be3900 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -32,7 +32,7 @@ import nzpy as nz #--- netezza drivers import sqlite3 import copy import os - +import time class SQLRW : lock = RLock() @@ -357,7 +357,7 @@ class SQLWriter(SQLRW,Writer): # # Writing with schema information ... 
rows = _info.iloc[i].to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) - + time.sleep(1) else: _fields = ",".join(self.fields) _sql = _sql.replace(":fields",_fields) @@ -495,10 +495,11 @@ class BQWriter(BigQuery,Writer): # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) # # Let us adjust the chunking here - self._chunkks = 10 if _df.shape[0] > SQLRW.MAX_CHUNK and self._chunks == 1 else self._chunks + self._chunks = 10 if _df.shape[0] > SQLRW.MAX_CHUNK and self._chunks == 1 else self._chunks _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) for i in _indexes : _df.iloc[i].to_gbq(**self.mode) + time.sleep(1) pass # # Aliasing the big query classes allowing it to be backward compatible diff --git a/transport/version.py b/transport/version.py index 3fa6e8d..5ad4744 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.2' +__version__= '1.9.3' From 92bf0600c3cd912c201a2c00f49a39a3aab176ad Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 22 Dec 2023 14:16:40 -0600 Subject: [PATCH 175/271] .. --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 0d20b64..68b5ef7 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,6 @@ +build *.pyc +*.csv +*.json +*.swp +*.egg-info From 88b0437bbc6d09c3a2d3bb2ebc0fee9769b60a96 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 26 Dec 2023 14:51:28 -0600 Subject: [PATCH 176/271] bug fix: etl jobs --- transport/etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/etl.py b/transport/etl.py index 198bdf0..fcc3e85 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -109,7 +109,7 @@ class Transporter(Process): SEGMENT_COUNT = 6 MAX_ROWS = 1000000 # _df = self.read() - _segments = np.array_split(np.range(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) + _segments = np.array_split(np.arange(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) # _index = 0 From 01d92c48b49def69c2739580b10ac03782fdfc83 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 26 Dec 2023 14:56:03 -0600 Subject: [PATCH 177/271] job delay ... --- transport/etl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/etl.py b/transport/etl.py index 198bdf0..2b0699e 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -117,6 +117,7 @@ class Transporter(Process): _fwd_args = {} if not _args else _args self._delegate_write(_df.iloc[_indexes],**_fwd_args) + time.sleep(1) # # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) 
pass From cedab73e19a1637c14de4267167d9533357b62b5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 8 Jan 2024 09:54:26 -0600 Subject: [PATCH 178/271] bigquery: job submission and status of said jobs --- transport/sql.py | 19 ++++++++++++++++++- transport/version.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/transport/sql.py b/transport/sql.py index 7be3900..3176cf7 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -25,6 +25,8 @@ else: import json from google.oauth2 import service_account from google.cloud import bigquery as bq +# import constants.bq_utils as bq_consts + from multiprocessing import Lock, RLock import pandas as pd import numpy as np @@ -462,7 +464,7 @@ class BQWriter(BigQuery,Writer): self.table = _args['table'] if 'table' in _args else None self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) - + self._location = 'US' if 'location' not in _args else _args['location'] def write(self,_info,**_args) : try: if self.parallel or 'lock' in _args : @@ -472,6 +474,21 @@ class BQWriter(BigQuery,Writer): finally: if self.parallel: BQWriter.lock.release() + def submit(self,_sql): + """ + Write the output of a massive query to a given table, biquery will handle this as a job + This function will return the job identifier + """ + _config = bq.QueryJobConfig() + _config.destination = self.client.dataset(self.dataset).table(self.table) + _config.allow_large_results = True + # _config.write_disposition = bq.bq_consts.WRITE_APPEND + _config.dry_run = False + # _config.priority = 'BATCH' + _resp = self.client.query(_sql,location=self._location,job_config=_config) + return _resp.job_id + def status (self,_id): + return self.client.get_job(_id,location=self._location) def _write(self,_info,**_args) : _df = None if type(_info) in [list,pd.DataFrame] : diff --git a/transport/version.py b/transport/version.py index 5ad4744..2b34f5b 100644 --- a/transport/version.py +++ b/transport/version.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.3' +__version__= '1.9.4' From c27beb16cc77238e3021f62a28bf147eca31e945 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 8 Jan 2024 10:24:29 -0600 Subject: [PATCH 179/271] providers ... 
version 2.0 prep --- transport/providers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/providers.py b/transport/providers.py index 93b6f53..95da002 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -35,6 +35,7 @@ MARIADB = 'mariadb' COUCHDB = 'couch' CONSOLE = 'console' ETL = 'etl' +TRANSPORT = ETL NEXTCLOUD = 'nextcloud' # From c6ff08ea6b91876dae41afdd1f95469f72bbf9c0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 15 Jan 2024 15:08:18 -0600 Subject: [PATCH 180/271] bug fix: added info package with version & author --- transport/version.py => info/__init__.py | 0 setup.py | 7 +- transport/__init__.py | 83 ++++++++++++------------ transport/mongo.py | 16 ++++- version.py | 1 - 5 files changed, 62 insertions(+), 45 deletions(-) rename transport/version.py => info/__init__.py (100%) delete mode 120000 version.py diff --git a/transport/version.py b/info/__init__.py similarity index 100% rename from transport/version.py rename to info/__init__.py diff --git a/setup.py b/setup.py index c322c38..9eb2c3e 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,10 @@ This is a build file for the from setuptools import setup, find_packages import os import sys -from version import __version__,__author__ +# from version import __version__,__author__ +from info import __version__, __author__ + + # __author__ = 'The Phi Technology' # __version__= '1.8.0' @@ -15,7 +18,7 @@ args = { "version":__version__, "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", - "packages":["transport"]} + "packages":["transport","info"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy<2.0.0','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" diff --git a/transport/__init__.py b/transport/__init__.py index 234c418..7b37a12 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -21,8 +21,8 @@ The configuration for the data-store is as follows : provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: """ -import pandas as pd -import numpy as np +# import pandas as pd +# import numpy as np import json import importlib import sys @@ -38,7 +38,8 @@ if sys.version_info[0] > 2 : from transport import mongo as mongo from transport import sql as sql from transport import etl as etl - from transport.version import __version__ + # from transport.version import __version__ + from info import __version__,__author__ from transport import providers else: from common import Reader, Writer,Console #, factory @@ -51,10 +52,10 @@ else: import etl from version import __version__ import providers -import psycopg2 as pg -import mysql.connector as my -from google.cloud import bigquery as bq -import nzpy as nz #--- netezza drivers +# import psycopg2 as pg +# import mysql.connector as my +# from google.cloud import bigquery as bq +# import nzpy as nz #--- netezza drivers import os # class providers : @@ -85,42 +86,42 @@ import os # import providers # class IEncoder (json.JSONEncoder): -def IEncoder (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - 
return o.isoformat() - else: - return super(IEncoder,self).default(object) +# def IEncoder (self,object): +# if type(object) == np.integer : +# return int(object) +# elif type(object) == np.floating: +# return float(object) +# elif type(object) == np.ndarray : +# return object.tolist() +# elif type(object) == datetime : +# return o.isoformat() +# else: +# return super(IEncoder,self).default(object) class factory : - TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} - PROVIDERS = { - "etl":{"class":{"read":etl.instance,"write":etl.instance}}, - # "console":{"class":{"write":Console,"read":Console}}, - "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, - "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "redshift":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, - "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, - "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, - "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener,"listener":queue.QueueListener},"default":{"type":"application/json"}}} - # - # creating synonyms - PROVIDERS['mongodb'] = PROVIDERS['mongo'] - PROVIDERS['couchdb'] = PROVIDERS['couch'] - PROVIDERS['bq'] = PROVIDERS['bigquery'] - PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] - PROVIDERS['rabbit'] = PROVIDERS['rabbitmq'] - PROVIDERS['rabbitmq-server'] = PROVIDERS['rabbitmq'] + # TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} + # PROVIDERS = { + # "etl":{"class":{"read":etl.instance,"write":etl.instance}}, + # # "console":{"class":{"write":Console,"read":Console}}, + # "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, + # "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, + # "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + # "redshift":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + # "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, + # "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + # "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + # "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, + # 
"couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, + # "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, + # "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener,"listener":queue.QueueListener},"default":{"type":"application/json"}}} + # # + # # creating synonyms + # PROVIDERS['mongodb'] = PROVIDERS['mongo'] + # PROVIDERS['couchdb'] = PROVIDERS['couch'] + # PROVIDERS['bq'] = PROVIDERS['bigquery'] + # PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] + # PROVIDERS['rabbit'] = PROVIDERS['rabbitmq'] + # PROVIDERS['rabbitmq-server'] = PROVIDERS['rabbitmq'] @staticmethod def instance(**_args): diff --git a/transport/mongo.py b/transport/mongo.py index bac1780..b4338b4 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -15,12 +15,26 @@ import gridfs # from transport import Reader,Writer import sys if sys.version_info[0] > 2 : - from transport.common import Reader, Writer, IEncoder + from transport.common import Reader, Writer else: from common import Reader, Writer import json import re from multiprocessing import Lock, RLock + +def IEncoder (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return o.isoformat() + else: + return super(IEncoder,self).default(object) + + class Mongo : lock = RLock() """ diff --git a/version.py b/version.py deleted file mode 120000 index e666b28..0000000 --- a/version.py +++ /dev/null @@ -1 +0,0 @@ -transport/version.py \ No newline at end of file From 483e5e81ed1b316cdf08894f2eb08b1a3f3158a3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 15 Jan 2024 15:08:56 -0600 Subject: [PATCH 181/271] version infomation --- info/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info/__init__.py b/info/__init__.py index 2b34f5b..ff8bcbb 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,2 +1,2 @@ __author__ = 'The Phi Technology' -__version__= '1.9.4' +__version__= '1.9.6' From 1ca99dc820c01e7d21ff9657acde37c11926e2dd Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 15 Jan 2024 16:52:12 -0600 Subject: [PATCH 182/271] bug fix: information --- transport/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/providers.py b/transport/providers.py index 95da002..3f70a38 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -17,7 +17,7 @@ from google.cloud import bigquery as bq import nzpy as nz #--- netezza drivers import os -from transport.version import __version__ +from info import __version__ POSTGRESQL = 'postgresql' MONGODB = 'mongodb' From 35b261edbfe1bfd051c7d3803cff77582d2b33ba Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 19 Jan 2024 23:55:35 -0600 Subject: [PATCH 183/271] bug fix: etl, mongodb --- info/__init__.py | 12 ++++++++++++ transport/mongo.py | 25 +++++++------------------ transport/providers.py | 3 ++- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index ff8bcbb..d5e703a 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,2 +1,14 @@ __author__ = 'The Phi Technology' __version__= '1.9.6' +__license__=""" + + +Copyright 2010 - 2024, Steve L. 
Nyemba + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +""" diff --git a/transport/mongo.py b/transport/mongo.py index b4338b4..c7b5ed8 100644 --- a/transport/mongo.py +++ b/transport/mongo.py @@ -15,26 +15,12 @@ import gridfs # from transport import Reader,Writer import sys if sys.version_info[0] > 2 : - from transport.common import Reader, Writer + from transport.common import Reader, Writer, IEncoder else: from common import Reader, Writer import json import re from multiprocessing import Lock, RLock - -def IEncoder (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - return o.isoformat() - else: - return super(IEncoder,self).default(object) - - class Mongo : lock = RLock() """ @@ -93,7 +79,7 @@ class Mongo : q = self.uid in self.client[self.dbname].list_collection_names() return p and q def setattr(self,key,value): - _allowed = ['host','port','db','doc','authSource','mechanism'] + _allowed = ['host','port','db','doc','collection','authSource','mechanism'] if key in _allowed : setattr(self,key,value) pass @@ -113,10 +99,13 @@ class MongoReader(Mongo,Reader): # # @TODO: cmd = {} + if 'aggregate' not in cmd and 'aggregate' not in args: + cmd['aggregate'] = self.uid + elif 'aggregate' in args : + cmd['aggregate'] = args['aggregate'] if 'pipeline' in args : cmd['pipeline']= args['pipeline'] - if 'aggregate' not in cmd : - cmd['aggregate'] = self.uid + if 'pipeline' not in args or 'aggregate' not in cmd : cmd = args['mongo'] if 'mongo' in args else args['cmd'] if "aggregate" in cmd : diff --git a/transport/providers.py b/transport/providers.py index 3f70a38..ddb2fcb 100644 --- a/transport/providers.py +++ b/transport/providers.py @@ -95,7 +95,8 @@ PROVIDERS = { MONGODB:{'read':mongo.MongoReader,'write':mongo.MongoWriter,'default':{'port':27017,'host':'localhost'}}, COUCHDB:{'read':couch.CouchReader,'writer':couch.CouchWriter,'default':{'host':'localhost','port':5984}}, - ETL :{'read':etl.Transporter,'write':etl.Transporter} +# ETL :{'read':etl.Transporter,'write':etl.Transporter} + ETL :{'read':etl.instance,'write':etl.instance} } DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} From e4faac7d6e3a62b5d73c6f109942bac3a81dbad9 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 2 Feb 2024 18:11:20 -0600 Subject: [PATCH 184/271] bug fixes: sqlite writer with sqlalchemy 
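The gist of the fix is to let pandas and SQLAlchemy handle the sqlite insert; a minimal standalone sketch of that pattern (database and table names are placeholders, not taken from this repository):

    import pandas as pd
    from sqlalchemy import create_engine

    _engine = create_engine('sqlite:///sample.db3')
    _df = pd.DataFrame([{"id":1,"name":"alice"}])
    _df.to_sql('addresses',_engine,if_exists='append',index=False)

Passing the engine itself (rather than an open connection) lets pandas manage the connection and the commit.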
--- info/__init__.py | 2 +- transport/__init__.py | 2 +- transport/disk.py | 19 +++++++++++++++++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index d5e703a..1ff6503 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '1.9.6' +__version__= '1.9.8' __license__=""" diff --git a/transport/__init__.py b/transport/__init__.py index 7b37a12..c820978 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -50,7 +50,7 @@ else: import s3 import sql import etl - from version import __version__ + from info import __version__,__author__ import providers # import psycopg2 as pg # import mysql.connector as my diff --git a/transport/disk.py b/transport/disk.py index 424e95e..2f39867 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -13,7 +13,8 @@ import sqlite3 import pandas as pd from multiprocessing import Lock from transport.common import Reader, Writer, IEncoder - +import sqlalchemy +from sqlalchemy import create_engine class DiskReader(Reader) : """ This class is designed to read data from disk (location on hard drive) @@ -22,6 +23,7 @@ class DiskReader(Reader) : def __init__(self,**params): """ + @param path absolute path of the file to be read """ @@ -111,6 +113,8 @@ class SQLite : self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") self.conn.row_factory = sqlite3.Row self.fields = _args['fields'] if 'fields' in _args else [] + path = self._path + self._engine = create_engine(f'sqlite://{path}') def has (self,**_args): found = False try: @@ -207,7 +211,18 @@ class SQLiteWriter(SQLite,DiskWriter) : # # If the table doesn't exist we should create it # - def write(self,info,**_args): + def write(self,_data,**_args): + SQLiteWriter.LOCK.acquire() + try: + if type(_data) == dict : + _data = [_data] + _table = self.table if 'table' not in _args else _args['table'] + _df = pd.DataFrame(_data) + _df.to_sql(_table,self._engine.connect()) + except Exception as e: + print (e) + SQLiteWriter.LOCK.release() + def _write(self,info,**_args): """ """ From 9270695c950a99d15f970818247675411c91e90b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 2 Feb 2024 18:16:34 -0600 Subject: [PATCH 185/271] bug fixes: sqlite writer with sqlalchemy --- transport/disk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index 2f39867..94c903c 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -113,8 +113,6 @@ class SQLite : self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") self.conn.row_factory = sqlite3.Row self.fields = _args['fields'] if 'fields' in _args else [] - path = self._path - self._engine = create_engine(f'sqlite://{path}') def has (self,**_args): found = False try: @@ -178,6 +176,8 @@ class SQLiteWriter(SQLite,DiskWriter) : # DiskWriter.__init__(self,**args) super().__init__(**args) self.table = args['table'] if 'table' in args else None + path = self.path + self._engine = create_engine(f'sqlite://{path}') # self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") # self.conn.row_factory = sqlite3.Row From 0ac7e68bbd1e6b79a905c61c881b331ab27b87fe Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 2 Feb 2024 18:22:17 -0600 Subject: [PATCH 186/271] bug fixes: sqlite writer with sqlalchemy --- transport/disk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/disk.py b/transport/disk.py index 94c903c..f5f8a97 100644 --- a/transport/disk.py 
+++ b/transport/disk.py @@ -177,7 +177,7 @@ class SQLiteWriter(SQLite,DiskWriter) : super().__init__(**args) self.table = args['table'] if 'table' in args else None path = self.path - self._engine = create_engine(f'sqlite://{path}') + self._engine = create_engine(f'sqlite:///{path}') # self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") # self.conn.row_factory = sqlite3.Row From c3fd371cb70a6f19941bbcae6cd27c05effe1626 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 2 Feb 2024 19:18:23 -0600 Subject: [PATCH 187/271] bug fixes: sqlite writer with sqlalchemy --- transport/disk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/disk.py b/transport/disk.py index f5f8a97..fab5eb1 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -218,7 +218,7 @@ class SQLiteWriter(SQLite,DiskWriter) : _data = [_data] _table = self.table if 'table' not in _args else _args['table'] _df = pd.DataFrame(_data) - _df.to_sql(_table,self._engine.connect()) + _df.to_sql(_table,self._engine.connect()if_exists='append',index=False) except Exception as e: print (e) SQLiteWriter.LOCK.release() From c3ebd32a40b3df91ece89118b8c072d3f144b2a6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 2 Feb 2024 19:19:23 -0600 Subject: [PATCH 188/271] bug fixes: sqlite writer with sqlalchemy --- transport/disk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/disk.py b/transport/disk.py index fab5eb1..e31a88f 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -218,7 +218,7 @@ class SQLiteWriter(SQLite,DiskWriter) : _data = [_data] _table = self.table if 'table' not in _args else _args['table'] _df = pd.DataFrame(_data) - _df.to_sql(_table,self._engine.connect()if_exists='append',index=False) + _df.to_sql(_table,self._engine.connect(),if_exists='append',index=False) except Exception as e: print (e) SQLiteWriter.LOCK.release() From 0f60c748baf45834c6c9f90c7bf7e7bd1f70f34d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 5 Feb 2024 22:35:14 -0600 Subject: [PATCH 189/271] bug fix --- transport/disk.py | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/transport/disk.py b/transport/disk.py index e31a88f..5e43b69 100644 --- a/transport/disk.py +++ b/transport/disk.py @@ -211,18 +211,18 @@ class SQLiteWriter(SQLite,DiskWriter) : # # If the table doesn't exist we should create it # - def write(self,_data,**_args): - SQLiteWriter.LOCK.acquire() - try: - if type(_data) == dict : - _data = [_data] - _table = self.table if 'table' not in _args else _args['table'] - _df = pd.DataFrame(_data) - _df.to_sql(_table,self._engine.connect(),if_exists='append',index=False) - except Exception as e: - print (e) - SQLiteWriter.LOCK.release() - def _write(self,info,**_args): + # def write(self,_data,**_args): + # SQLiteWriter.LOCK.acquire() + # try: + # if type(_data) == dict : + # _data = [_data] + # _table = self.table if 'table' not in _args else _args['table'] + # _df = pd.DataFrame(_data) + # _df.to_sql(_table,self._engine.connect(),if_exists='append',index=False) + # except Exception as e: + # print (e) + # SQLiteWriter.LOCK.release() + def write(self,info,**_args): """ """ @@ -247,17 +247,23 @@ class SQLiteWriter(SQLite,DiskWriter) : cursor = self.conn.cursor() sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) for row in info : - stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] - stream = 
json.dumps(stream,cls=IEncoder) - stream = stream.replace("[","").replace("]","") - + values = [ str(row[field]) if type(row[field]) not in [list,dict] else json.dumps(row[field],cls=IEncoder) for field in self.fields] + values = ["".join(["'",value,"'"]) for value in values] + + # stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] + # stream = json.dumps(stream,cls=IEncoder) + # stream = stream.replace("[","").replace("]","") - self.conn.execute(sql.replace(":values",stream) ) + # print (sql.replace(":values",stream)) + # self.conn.execute(sql.replace(":values",stream) ) + self.conn.execute(sql.replace(":values", ",".join(values)) ) # cursor.commit() self.conn.commit() # print (sql) except Exception as e : + print () + print (e) pass SQLiteWriter.LOCK.release() From 95ad4c1a6b5cac653ad1d57f7bca1e61570737c1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 19 Feb 2024 15:36:09 -0600 Subject: [PATCH 190/271] version 2.0-RC (2024) --- info/__init__.py | 2 +- setup.py | 18 +----------------- transport/sql.py | 5 +++-- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 1ff6503..9e789d5 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '1.9.8' +__version__= '2.0-RC' __license__=""" diff --git a/setup.py b/setup.py index 9eb2c3e..df12cb4 100644 --- a/setup.py +++ b/setup.py @@ -20,26 +20,10 @@ args = { "license":"MIT", "packages":["transport","info"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','pymongo','sqlalchemy<2.0.0','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : args['use_2to3'] = True args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] setup(**args) -# setup( -# name = "data-transport", -# version = "1.0", -# author = "The Phi Technology LLC", -# author_email = "steve@the-phi.com", -# license = "MIT", -# packages=['transport'], -# keywords=['mongodb','couchdb','rabbitmq','file','read','write','s3'], -# install_requires = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open'], -# url="https://dev.the-phi.com/git/steve/data-transport.git", -# use_2to3=True, -# long_description=read('README.md'), -# convert_2to3_doctests=['README.md'], -# #use_2to3_fixers=['your.fixers'], -# use_2to3_exclude_fixers=['lib2to3.fixes.fix_import'], -# ) diff --git a/transport/sql.py b/transport/sql.py index 3176cf7..c5b52d4 100644 --- a/transport/sql.py +++ b/transport/sql.py @@ -29,6 +29,7 @@ from google.cloud import bigquery as bq from multiprocessing import Lock, RLock import pandas as pd +import pandas_gbq as pd_gbq import numpy as np import nzpy as nz #--- netezza drivers import sqlite3 @@ -409,7 +410,7 @@ class BigQuery: _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] sql = f"""SELECT column_name as name, 
data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ _info = {'credentials':self.credentials,'dialect':'standard'} - return pd.read_gbq(sql,**_info).to_dict(orient='records') + return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records') # return self.read(sql=sql).to_dict(orient='records') # ref = self.client.dataset(self.dataset).table(table) @@ -451,7 +452,7 @@ class BQReader(BigQuery,Reader) : if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) _info = {'credentials':self.credentials,'dialect':'standard'} - return pd.read_gbq(SQL,**_info) if SQL else None + return pd_gbq.read_gbq(SQL,**_info) if SQL else None # return self.client.query(SQL).to_dataframe() if SQL else None From 8d2eb62218f2935f734cd1dc355b5f5f132a4325 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 20 Feb 2024 09:08:05 -0600 Subject: [PATCH 191/271] bug fix with 2024 pandas & sqlalchemy --- info/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info/__init__.py b/info/__init__.py index 9e789d5..9ffaa4d 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '2.0-RC' +__version__= '1.9.8.1' __license__=""" From 764b3d6af08c61edae45b34333762805438e1e61 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 27 Feb 2024 12:37:16 -0600 Subject: [PATCH 192/271] bug fix: psycopg2 with numpy --- info/__init__.py | 2 +- setup.py | 2 +- transport/__init__.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 9ffaa4d..57f7289 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '1.9.8.1' +__version__= '1.9.8.20' __license__=""" diff --git a/setup.py b/setup.py index df12cb4..40ba3fb 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ args = { "license":"MIT", "packages":["transport","info"]} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python'] +args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index c820978..9e12b3f 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -52,6 +52,11 @@ else: import etl from info import __version__,__author__ import providers + +import numpy as np +from psycopg2.extensions import register_adapter, AsIs +register_adapter(np.int64, AsIs) + # import psycopg2 as pg # import mysql.connector as my # from google.cloud import bigquery as bq From e7838f5de121e0e26604d3a87aa919a92c429f09 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 28 Mar 2024 15:34:39 -0500 Subject: [PATCH 193/271] refactoring version 2.0 --- bin/transport | 47 +-- info/__init__.py | 2 +- transport/__init__.py | 
425 +++++--------------------- transport/cloud/__init__.py | 6 + transport/cloud/bigquery.py | 156 ++++++++++ transport/cloud/databricks.py | 111 +++++++ transport/cloud/nextcloud.py | 80 +++++ transport/cloud/s3.py | 127 ++++++++ transport/nosql/__init__.py | 10 + transport/nosql/couchdb.py | 213 +++++++++++++ transport/nosql/mongodb.py | 242 +++++++++++++++ transport/other/__init__.py | 1 + transport/other/callback.py | 45 +++ transport/other/console.py | 7 + transport/other/files.py | 68 +++++ transport/other/http.py | 88 ++++++ transport/other/rabbitmq.py | 272 +++++++++++++++++ transport/providers.py | 105 ------- transport/providers/__init__.py | 44 +++ transport/sql.py | 526 -------------------------------- transport/sql/__init__.py | 18 ++ transport/sql/common.py | 125 ++++++++ transport/sql/mysql.py | 18 ++ transport/sql/netezza.py | 15 + transport/sql/postgresql.py | 22 ++ transport/sql/sqlite.py | 25 ++ 26 files changed, 1775 insertions(+), 1023 deletions(-) create mode 100644 transport/cloud/__init__.py create mode 100644 transport/cloud/bigquery.py create mode 100644 transport/cloud/databricks.py create mode 100644 transport/cloud/nextcloud.py create mode 100644 transport/cloud/s3.py create mode 100644 transport/nosql/__init__.py create mode 100644 transport/nosql/couchdb.py create mode 100644 transport/nosql/mongodb.py create mode 100644 transport/other/__init__.py create mode 100644 transport/other/callback.py create mode 100644 transport/other/console.py create mode 100644 transport/other/files.py create mode 100644 transport/other/http.py create mode 100644 transport/other/rabbitmq.py delete mode 100644 transport/providers.py create mode 100644 transport/providers/__init__.py delete mode 100644 transport/sql.py create mode 100644 transport/sql/__init__.py create mode 100644 transport/sql/common.py create mode 100644 transport/sql/mysql.py create mode 100644 transport/sql/netezza.py create mode 100644 transport/sql/postgresql.py create mode 100644 transport/sql/sqlite.py diff --git a/bin/transport b/bin/transport index dd424a2..363d2d9 100755 --- a/bin/transport +++ b/bin/transport @@ -48,24 +48,8 @@ import typer import os import transport from transport import etl -from transport import providers +# from transport import providers -# SYS_ARGS = {} -# if len(sys.argv) > 1: - -# N = len(sys.argv) -# for i in range(1,N): -# value = None -# if sys.argv[i].startswith('--'): -# key = sys.argv[i][2:] #.replace('-','') -# SYS_ARGS[key] = 1 -# if i + 1 < N: -# value = sys.argv[i + 1] = sys.argv[i+1].strip() -# if key and value and not value.startswith('--'): -# SYS_ARGS[key] = value - - -# i += 2 app = typer.Typer() @@ -77,7 +61,7 @@ def wait(jobs): jobs = [thread for thread in jobs if thread.is_alive()] time.sleep(1) -@app.command() +@app.command(name="apply") def move (path,index=None): _proxy = lambda _object: _object.write(_object.read()) @@ -90,27 +74,14 @@ def move (path,index=None): etl.instance(**_config) else: etl.instance(config=_config) - - # - # if type(_config) == dict : - # _object = transport.etl.instance(**_config) - # _proxy(_object) - # else: - # # - # # here we are dealing with a list of objects (long ass etl job) - # jobs = [] - # failed = [] - # for _args in _config : - # if index and _config.index(_args) != index : - # continue +@app.command(name="providers") +def supported (format:str="table") : + """ + This function will print supported providers and their associated classifications + """ + _df = (transport.supported()) + print 
(json.dumps(_df.to_dict(orient="list"))) - # _object=transport.etl.instance(**_args) - # thread = Process(target=_proxy,args=(_object,)) - # thread.start() - # jobs.append(thread()) - # if _config.index(_args) == 0 : - # thread.join() - # wait(jobs) @app.command() def version(): print (transport.version.__version__) diff --git a/info/__init__.py b/info/__init__.py index 57f7289..2d27032 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '1.9.8.20' +__version__= '2.0.0' __license__=""" diff --git a/transport/__init__.py b/transport/__init__.py index 9e12b3f..288f646 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -11,360 +11,79 @@ This library is designed to serve as a wrapper to a set of supported data stores - s3 - sqlite The supported operations are read/write and providing meta data to the calling code -Requirements : - pymongo - boto - couldant -The configuration for the data-store is as follows : - e.g: - mongodb - provider:'mongodb',[port:27017],[host:localhost],db:,doc:<_name>,context: +We separated reads from writes to mitigate accidents associated with writes. +Source Code is available under MIT License: + https://healthcareio.the-phi.com/data-transport + https://hiplab.mc.vanderbilt.edu/git/hiplab/data-transport """ - -# import pandas as pd -# import numpy as np -import json -import importlib -import sys -import sqlalchemy -from datetime import datetime -if sys.version_info[0] > 2 : - # from transport.common import Reader, Writer,Console #, factory - from transport import disk - - from transport import s3 as s3 - from transport import rabbitmq as queue - from transport import couch as couch - from transport import mongo as mongo - from transport import sql as sql - from transport import etl as etl - # from transport.version import __version__ - from info import __version__,__author__ - from transport import providers -else: - from common import Reader, Writer,Console #, factory - import disk - import queue - import couch - import mongo - import s3 - import sql - import etl - from info import __version__,__author__ - import providers - import numpy as np -from psycopg2.extensions import register_adapter, AsIs -register_adapter(np.int64, AsIs) - -# import psycopg2 as pg -# import mysql.connector as my -# from google.cloud import bigquery as bq -# import nzpy as nz #--- netezza drivers +from transport import sql, nosql, cloud, other +import pandas as pd +import json import os - -# class providers : -# POSTGRESQL = 'postgresql' -# MONGODB = 'mongodb' - -# BIGQUERY ='bigquery' -# FILE = 'file' -# ETL = 'etl' -# SQLITE = 'sqlite' -# SQLITE3= 'sqlite' -# REDSHIFT = 'redshift' -# NETEZZA = 'netezza' -# MYSQL = 'mysql' -# RABBITMQ = 'rabbitmq' -# MARIADB = 'mariadb' -# COUCHDB = 'couch' -# CONSOLE = 'console' -# ETL = 'etl' -# # -# # synonyms of the above -# BQ = BIGQUERY -# MONGO = MONGODB -# FERRETDB= MONGODB -# PG = POSTGRESQL -# PSQL = POSTGRESQL -# PGSQL = POSTGRESQL -# import providers - -# class IEncoder (json.JSONEncoder): -# def IEncoder (self,object): -# if type(object) == np.integer : -# return int(object) -# elif type(object) == np.floating: -# return float(object) -# elif type(object) == np.ndarray : -# return object.tolist() -# elif type(object) == datetime : -# return o.isoformat() -# else: -# return super(IEncoder,self).default(object) - +from info import __version__,__author__ + +PROVIDERS = {} +def init(): + global PROVIDERS + for _module in [cloud,sql,nosql,other] : + for _provider_name 
in dir(_module) : + if _provider_name.startswith('__') : + continue + PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} +# print ([ {name:getattr(sql,name)} for name in dir(sql) if not name.startswith('__')]) + +def instance (**_args): + """ + type: + read: true|false (default true) + auth_file + """ + global PROVIDERS + if 'auth_file' in _args: + if os.path.exists(_args['auth_file']) : + f = open(_args['auth_file']) + _args = dict (_args,** json.loads(f.read()) ) + f.close() + else: + filename = _args['auth_file'] + raise Exception(f" {filename} was not found or is invalid") + if _args['provider'] in PROVIDERS : + _info = PROVIDERS[_args['provider']] + _module = _info['module'] + if 'context' in _args : + _context = _args['context'] + else: + _context = 'read' + _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') + return _pointer (**_args) + pass + else: + raise Exception ("Missing or Unknown provider") + pass +def supported (): + _info = {} + for _provider in PROVIDERS : + _item = PROVIDERS[_provider] + if _item['type'] not in _info : + _info[_item['type']] = [] + _info[_item['type']].append(_provider) + _df = pd.DataFrame() + for _id in _info : + if not _df.shape[0] : + _df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')]) + else: + _df = pd.DataFrame(_info[_id],columns=[_id.replace('transport.','')]).join(_df, how='outer') + return _df.fillna('') class factory : - # TYPE = {"sql":{"providers":["postgresql","mysql","neteeza","bigquery","mariadb","redshift"]}} - # PROVIDERS = { - # "etl":{"class":{"read":etl.instance,"write":etl.instance}}, - # # "console":{"class":{"write":Console,"read":Console}}, - # "file":{"class":{"read":disk.DiskReader,"write":disk.DiskWriter}}, - # "sqlite":{"class":{"read":disk.SQLiteReader,"write":disk.SQLiteWriter}}, - # "postgresql":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "redshift":{"port":5432,"host":"localhost","database":None,"driver":pg,"default":{"type":"VARCHAR"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "bigquery":{"class":{"read":sql.BQReader,"write":sql.BQWriter}}, - # "mysql":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my,"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, - # "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, - # "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"},"class":{"read":sql.SQLReader,"write":sql.SQLWriter}}, - # "rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener,"listener":queue.QueueListener},"default":{"type":"application/json"}}} - # # - # # creating synonyms - # PROVIDERS['mongodb'] = PROVIDERS['mongo'] - # PROVIDERS['couchdb'] = PROVIDERS['couch'] - # PROVIDERS['bq'] = PROVIDERS['bigquery'] - # PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] - # PROVIDERS['rabbit'] = PROVIDERS['rabbitmq'] - # PROVIDERS['rabbitmq-server'] = PROVIDERS['rabbitmq'] - - @staticmethod - def instance(**_args): - if 'type' in _args : - # - # Legacy code being returned - return 
factory._instance(**_args); - - - - else: - return instance(**_args) - @staticmethod - def _instance(**args): - """ - This class will create an instance of a transport when providing - :type name of the type we are trying to create - :args The arguments needed to create the instance - """ - source = args['type'] - params = args['args'] - anObject = None - - if source in ['HttpRequestReader','HttpSessionWriter']: - # - # @TODO: Make sure objects are serializable, be smart about them !! - # - aClassName = ''.join([source,'(**params)']) - - - else: - - stream = json.dumps(params) - aClassName = ''.join([source,'(**',stream,')']) - - try: - anObject = eval( aClassName) - #setattr(anObject,'name',source) - except Exception as e: - print(['Error ',e]) - return anObject - -import time -def instance(**_pargs): - """ - creating an instance given the provider, we should have an idea of :class, :driver - :provider - :read|write = {connection to the database} - """ - # - # @TODO: provide authentication file that will hold all the parameters, that will later on be used - # - _args = dict(_pargs,**{}) - if 'auth_file' in _args : - path = _args['auth_file'] - file = open(path) - _config = json.loads( file.read()) - _args = dict(_args,**_config) - file.close() - - _provider = _args['provider'] - _context = list( set(['read','write','listen']) & set(_args.keys()) ) - if _context : - _context = _context[0] - else: - _context = _args['context'] if 'context' in _args else 'read' - # _group = None - - - # for _id in providers.CATEGORIES : - # if _provider in providers.CATEGORIES[_id] : - # _group = _id - # break - # if _group : - - if _provider in providers.PROVIDERS and _context in providers.PROVIDERS[_provider]: - - # _classPointer = _getClassInstance(_group,**_args) - _classPointer = providers.PROVIDERS[_provider][_context] - # - # Let us reformat the arguments - # if 'read' in _args or 'write' in _args : - # _args = _args['read'] if 'read' in _args else _args['write'] - # _args['provider'] = _provider - # if _group == 'sql' : - if _provider in providers.CATEGORIES['sql'] : - _info = _get_alchemyEngine(**_args) - - _args = dict(_args,**_info) - _args['driver'] = providers.DRIVERS[_provider] - - else: - if _provider in providers.DEFAULT : - _default = providers.DEFAULT[_provider] - _defkeys = list(set(_default.keys()) - set(_args.keys())) - if _defkeys : - for key in _defkeys : - _args[key] = _default[key] - pass - # - # get default values from - - return _classPointer(**_args) - # - # Let us determine the category of the provider that has been given -def _get_alchemyEngine(**_args): - """ - This function returns the SQLAlchemy engine associated with parameters, This is only applicable for SQL _items - :_args arguments passed to the factory {provider and other} - """ - _provider = _args['provider'] - _pargs = {} - if _provider == providers.SQLITE3 : - _path = _args['database'] if 'database' in _args else _args['path'] - uri = ''.join([_provider,':///',_path]) - - else: - - #@TODO: Enable authentication files (private_key) - _username = _args['username'] if 'username' in _args else '' - _password = _args['password'] if 'password' in _args else '' - _account = _args['account'] if 'account' in _args else '' - _database = _args['database'] if 'database' in _args else _args['path'] - - if _username != '': - _account = _username + ':'+_password+'@' - _host = _args['host'] if 'host' in _args else '' - _port = _args['port'] if 'port' in _args else '' - if _provider in providers.DEFAULT : - _default = 
providers.DEFAULT[_provider] - _host = _host if _host != '' else (_default['host'] if 'host' in _default else '') - _port = _port if _port != '' else (_default['port'] if 'port' in _default else '') - if _port == '': - _port = providers.DEFAULT['port'] if 'port' in providers.DEFAULT else '' - # - - if _host != '' and _port != '' : - _fhost = _host+":"+str(_port) #--formatted hostname - else: - _fhost = _host - # Let us update the parameters we have thus far - # - - - uri = ''.join([_provider,"://",_account,_fhost,'/',_database]) - _pargs = {'host':_host,'port':_port,'username':_username,'password':_password} - _engine = sqlalchemy.create_engine (uri,future=True) - _out = {'sqlalchemy':_engine} - - for key in _pargs : - if _pargs[key] != '' : - _out[key] = _pargs[key] - return _out -@DeprecationWarning -def _getClassInstance(_group,**_args): - """ - This function returns the class instance we are attempting to instanciate - :_group items in providers.CATEGORIES.keys() - :_args arguments passed to the factory class - """ - # if 'read' in _args or 'write' in _args : - # _context = 'read' if 'read' in _args else _args['write'] - # _info = _args[_context] - # else: - # _context = _args['context'] if 'context' in _args else 'read' - # _class = providers.READ[_group] if _context == 'read' else providers.WRITE[_group] - # if type(_class) == dict and _args['provider'] in _class: - # _class = _class[_args['provider']] - - # return _class - -@DeprecationWarning -def __instance(**_args): - """ - - @param provider {file,sqlite,postgresql,redshift,bigquery,netezza,mongo,couch ...} - @param context read|write|rw - @param _args argument to got with the datastore (username,password,host,port ...) - """ - - provider = _args['provider'] - context = _args['context']if 'context' in _args else None - _id = context if context in list(factory.PROVIDERS[provider]['class'].keys()) else 'read' - if _id : - args = {'provider':_id} - for key in factory.PROVIDERS[provider] : - if key == 'class' : - continue - value = factory.PROVIDERS[provider][key] - args[key] = value - # - # - - args = dict(args,**_args) - - # print (provider in factory.PROVIDERS) - if 'class' in factory.PROVIDERS[provider]: - pointer = factory.PROVIDERS[provider]['class'][_id] - else: - pointer = sql.SQLReader if _id == 'read' else sql.SQLWriter - # - # Let us try to establish an sqlalchemy wrapper - try: - account = '' - host = '' - if provider not in [providers.BIGQUERY,providers.MONGODB, providers.COUCHDB, providers.SQLITE, providers.CONSOLE,providers.ETL, providers.FILE, providers.RABBITMQ] : - # if provider not in ['bigquery','mongodb','mongo','couchdb','sqlite','console','etl','file','rabbitmq'] : - # - # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery - username = args['username'] if 'username' in args else '' - password = args['password'] if 'password' in args else '' - if username == '' : - account = '' - else: - account = username + ':'+password+'@' - host = args['host'] - if 'port' in args : - host = host+":"+str(args['port']) - - database = args['database'] - elif provider in [providers.SQLITE,providers.FILE]: - account = '' - host = '' - database = args['path'] if 'path' in args else args['database'] - - if provider not in [providers.MONGODB, providers.COUCHDB, providers.BIGQUERY, providers.CONSOLE, providers.ETL,providers.FILE,providers.RABBITMQ] : - # if provider not in ['mongodb','mongo','couchdb','bigquery','console','etl','file','rabbitmq'] : - uri = ''.join([provider,"://",account,host,'/',database]) 
- - e = sqlalchemy.create_engine (uri,future=True) - args['sqlalchemy'] = e - - # - # @TODO: Include handling of bigquery with SQLAlchemy - except Exception as e: - print (_args) - print (e) - - return pointer(**args) - - return None + pass +factory.instance = instance +init() +# if __name__ == '__main__' : +# # if not PROVIDERS : +# init() +# print (list(PROVIDERS.keys())) +# pgr = instance(provider='postgresql',database='io',table='foo',write=True) +# print (pgr.read()) +# print () +# print (supported()) \ No newline at end of file diff --git a/transport/cloud/__init__.py b/transport/cloud/__init__.py new file mode 100644 index 0000000..e741ed0 --- /dev/null +++ b/transport/cloud/__init__.py @@ -0,0 +1,6 @@ +""" +Steve L. Nyemba, nyemba@gmail.com +This namespace implements support for cloud databases databricks,bigquery ... +""" +from . import bigquery, databricks, nextcloud, s3 + diff --git a/transport/cloud/bigquery.py b/transport/cloud/bigquery.py new file mode 100644 index 0000000..479c060 --- /dev/null +++ b/transport/cloud/bigquery.py @@ -0,0 +1,156 @@ +""" +Implementing support for google's bigquery + - cloud.bigquery.Read + - cloud.bigquery.Write +""" +import json +from google.oauth2 import service_account +from google.cloud import bigquery as bq + +from multiprocessing import Lock, RLock +import pandas as pd +import pandas_gbq as pd_gbq +import numpy as np +import time + +MAX_CHUNK = 2000000 +class BigQuery: + def __init__(self,**_args): + path = _args['service_key'] if 'service_key' in _args else _args['private_key'] + self.credentials = service_account.Credentials.from_service_account_file(path) + self.dataset = _args['dataset'] if 'dataset' in _args else None + self.path = path + self.dtypes = _args['dtypes'] if 'dtypes' in _args else None + self.table = _args['table'] if 'table' in _args else None + self.client = bq.Client.from_service_account_json(self.path) + def meta(self,**_args): + """ + This function returns meta data for a given table or query with dataset/table properly formatted + :param table name of the name WITHOUT including dataset + :param sql sql query to be pulled, + """ + table = _args['table'] if 'table' in _args else self.table + + try: + if table : + _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] + sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ + _info = {'credentials':self.credentials,'dialect':'standard'} + return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records') + # return self.read(sql=sql).to_dict(orient='records') + # ref = self.client.dataset(self.dataset).table(table) + + # _schema = self.client.get_table(ref).schema + # return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] + else : + return [] + except Exception as e: + + return [] + def has(self,**_args): + found = False + try: + _has = self.meta(**_args) + found = _has is not None and len(_has) > 0 + except Exception as e: + pass + return found +class Reader (BigQuery): + """ + Implementing support for reading from bigquery, This class acts as a wrapper around google's API + """ + def __init__(self,**_args): + + super().__init__(**_args) + def apply(self,sql): + return self.read(sql=sql) + + def read(self,**_args): + SQL = None + table = self.table if 'table' not in _args else _args['table'] + if 'sql' in _args : + SQL = _args['sql'] + elif table: + + table = "".join(["`",table,"`"]) 
if '.' in table else "".join(["`:dataset.",table,"`"]) + SQL = "SELECT * FROM :table ".replace(":table",table) + if not SQL : + return None + if SQL and 'limit' in _args: + SQL += " LIMIT "+str(_args['limit']) + if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: + SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) + _info = {'credentials':self.credentials,'dialect':'standard'} + return pd_gbq.read_gbq(SQL,**_info) if SQL else None + # return self.client.query(SQL).to_dataframe() if SQL else None + +class Writer (BigQuery): + """ + This class implements support for writing against bigquery + """ + lock = RLock() + def __init__(self,**_args): + super().__init__(**_args) + + self.parallel = False if 'lock' not in _args else _args['lock'] + self.table = _args['table'] if 'table' in _args else None + self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} + self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) + self._location = 'US' if 'location' not in _args else _args['location'] + def write(self,_data,**_args) : + """ + This function will perform a write to bigquery + :_data data-frame to be written to bigquery + """ + try: + if self.parallel or 'lock' in _args : + Write.lock.acquire() + _args['table'] = self.table if 'table' not in _args else _args['table'] + self._write(_data,**_args) + finally: + if self.parallel: + Write.lock.release() + def submit(self,_sql): + """ + Write the output of a massive query to a given table, biquery will handle this as a job + This function will return the job identifier + """ + _config = bq.QueryJobConfig() + _config.destination = self.client.dataset(self.dataset).table(self.table) + _config.allow_large_results = True + # _config.write_disposition = bq.bq_consts.WRITE_APPEND + _config.dry_run = False + # _config.priority = 'BATCH' + _resp = self.client.query(_sql,location=self._location,job_config=_config) + return _resp.job_id + def status (self,_id): + return self.client.get_job(_id,location=self._location) + def _write(self,_info,**_args) : + _df = None + if type(_info) in [list,pd.DataFrame] : + if type(_info) == list : + _df = pd.DataFrame(_info) + elif type(_info) == pd.DataFrame : + _df = _info + + if '.' not in _args['table'] : + self.mode['destination_table'] = '.'.join([self.dataset,_args['table']]) + else: + + self.mode['destination_table'] = _args['table'].strip() + if 'schema' in _args : + self.mode['table_schema'] = _args['schema'] + # + # Let us insure that the types are somewhat compatible ... 
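
The comment above ponders aligning data-frame types with the BigQuery schema before the data is pushed. A minimal sketch (illustrative only, not part of the patch) of what that coercion could look like, driven by the schema records returned by BigQuery.meta(); the coerce helper and the _BQ_TO_PANDAS mapping are assumptions that mirror the commented-out _map below.

    import numpy as np
    import pandas as pd

    # assumed mapping, mirroring the commented-out _map in this patch
    _BQ_TO_PANDAS = {'INTEGER': np.int64, 'FLOAT': np.float64, 'DOUBLE': np.float64, 'STRING': str}

    def coerce(_df, _schema):
        # _schema is the list of {'name':..., 'type':...} records returned by meta()
        for _field in _schema:
            _name, _type = _field['name'], _field['type']
            if _name not in _df.columns:
                continue
            if _type in ('DATETIME', 'TIMESTAMP'):
                _df[_name] = pd.to_datetime(_df[_name])
            elif _type in _BQ_TO_PANDAS:
                _df[_name] = _df[_name].astype(_BQ_TO_PANDAS[_type])
        return _df
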
+ # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} + # _mode = copy.deepcopy(self.mode) + _mode = self.mode + # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) + # + # Let us adjust the chunking here + self._chunks = 10 if _df.shape[0] > MAX_CHUNK and self._chunks == 1 else self._chunks + _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) + for i in _indexes : + _df.iloc[i].to_gbq(**self.mode) + time.sleep(1) + pass \ No newline at end of file diff --git a/transport/cloud/databricks.py b/transport/cloud/databricks.py new file mode 100644 index 0000000..5c1ee0d --- /dev/null +++ b/transport/cloud/databricks.py @@ -0,0 +1,111 @@ +""" +This file implements databricks handling, This functionality will rely on databricks-sql-connector +LICENSE (MIT) +Copyright 2016-2020, The Phi Technology LLC + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
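
A minimal usage sketch (illustrative only, not part of the patch) of the Reader and Writer defined in this file, assuming the SQLAlchemy dialect behind the 'databricks+connector' URI is installed and a SQL warehouse is reachable; the host, token, cluster_path, catalog, schema and table values below are placeholders.

    import pandas as pd
    from transport.cloud import databricks

    _args = {'host': 'adb-1234567890.azuredatabricks.net',   # placeholder workspace host
             'token': 'dapi-xxxxxxxx',                        # placeholder access token
             'cluster_path': '/sql/1.0/warehouses/abcd1234',  # placeholder http_path
             'catalog': 'hive_metastore',
             'database': 'default'}

    reader = databricks.Reader(**_args)
    df = reader.read(table='my_table', limit=10)              # or reader.read(sql='SELECT ...')

    writer = databricks.Writer(**_args, table='my_table')
    writer.write(pd.DataFrame([{'id': 1, 'name': 'foo'}]))    # appends to the table
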
+ + +@TODO: + - Migrate SQLite to SQL hierarchy + - Include Write in Chunks from pandas +""" +import os +import sqlalchemy +# from transport.common import Reader,Writer +import pandas as pd + + +class Bricks: + """ + :host + :token + :database + :cluster_path + :table + """ + def __init__(self,**_args): + _host = _args['host'] + _token= _args['token'] + _cluster_path = _args['cluster_path'] + self._schema = _args['schema'] if 'schema' in _args else _args['database'] + _catalog = _args['catalog'] + self._table = _args['table'] if 'table' in _args else None + + # + # @TODO: + # Sometimes when the cluster isn't up and running it takes a while, the user should be alerted of this + # + + _uri = f'''databricks+connector://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}''' + self._engine = sqlalchemy.create_engine (_uri) + pass + def meta(self,**_args): + table = _args['table'] if 'table' in _args else self._table + if not table : + return [] + else: + if sqlalchemy.__version__.startswith('1.') : + _m = sqlalchemy.MetaData(bind=self._engine) + _m.reflect(only=[table]) + else: + _m = sqlalchemy.MetaData() + _m.reflect(bind=self._engine) + # + # Let's retrieve te information associated with a table + # + return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns] + + def has(self,**_args): + return self.meta(**_args) + def apply(self,_sql): + try: + if _sql.lower().startswith('select') : + return pd.read_sql(_sql,self._engine) + except Exception as e: + pass + +class Reader(Bricks): + """ + This class is designed for reads and will execute reads against a table name or a select SQL statement + """ + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + limit = None if 'limit' not in _args else str(_args['limit']) + + if 'sql' in _args : + sql = _args['sql'] + elif 'table' in _args : + table = _args['table'] + sql = f'SELECT * FROM {table}' + if limit : + sql = sql + f' LIMIT {limit}' + + if 'sql' in _args or 'table' in _args : + return self.apply(sql) + else: + return pd.DataFrame() + pass +class Writer(Bricks): + def __init__(self,**_args): + super().__init__(**_args) + def write(self,_data,**_args): + """ + This data will write data to data-bricks against a given table. 
If the table is not specified upon initiazation, it can be specified here + _data: data frame to push to databricks + _args: chunks, table, schema + """ + _schema = self._schema if 'schema' not in _args else _args['schema'] + _table = self._table if 'table' not in _args else _args['table'] + _df = _data if type(_data) == pd.DataFrame else _data + if type(_df) == dict : + _df = [_df] + if type(_df) == list : + _df = pd.DataFrame(_df) + _df.to_sql( + name=_table,schema=_schema, + con=self._engine,if_exists='append',index=False); + pass diff --git a/transport/cloud/nextcloud.py b/transport/cloud/nextcloud.py new file mode 100644 index 0000000..ebb44d3 --- /dev/null +++ b/transport/cloud/nextcloud.py @@ -0,0 +1,80 @@ +""" +We are implementing transport to and from nextcloud (just like s3) +""" +import os +import sys +from transport.common import IEncoder +import pandas as pd +from io import StringIO +import json +import nextcloud_client as nextcloud + +class Nextcloud : + def __init__(self,**_args): + pass + self._delimiter = None + self._handler = nextcloud.Client(_args['url']) + _uid = _args['uid'] + _token = _args['token'] + self._uri = _args['folder'] if 'folder' in _args else './' + if self._uri.endswith('/') : + self._uri = self._uri[:-1] + self._file = None if 'file' not in _args else _args['file'] + self._handler.login(_uid,_token) + def close(self): + try: + self._handler.logout() + except Exception as e: + pass + + +class Reader(Nextcloud): + def __init__(self,**_args): + # self._file = [] if 'file' not in _args else _args['file'] + super().__init__(**_args) + pass + def read(self,**_args): + _filename = self._file if 'file' not in _args else _args['file'] + # + # @TODO: if _filename is none, an exception should be raised + # + _uri = '/'.join([self._uri,_filename]) + if self._handler.get_file(_uri) : + # + # + _info = self._handler.file_info(_uri) + _content = self._handler.get_file_contents(_uri).decode('utf8') + if _info.get_content_type() == 'text/csv' : + # + # @TODO: enable handling of csv, xls, parquet, pickles + _file = StringIO(_content) + return pd.read_csv(_file) + else: + # + # if it is neither a structured document like csv, we will return the content as is + return _content + return None +class Writer (Nextcloud): + """ + This class will write data to an instance of nextcloud + """ + def __init__(self,**_args) : + super().__init__(**_args) + self + def write(self,_data,**_args): + """ + This function will upload a file to a given destination + :file has the uri of the location of the file + """ + _filename = self._file if 'file' not in _args else _args['file'] + _uri = '/'.join([self._uri,_filename]) + if type(_data) == pd.DataFrame : + f = StringIO() + _data.to_csv(f,index=False) + _content = f.getvalue() + elif type(_data) == dict : + _content = json.dumps(_data,cls=IEncoder) + else: + _content = str(_data) + self._handler.put_file_contents(_uri,_content) + diff --git a/transport/cloud/s3.py b/transport/cloud/s3.py new file mode 100644 index 0000000..4e230e8 --- /dev/null +++ b/transport/cloud/s3.py @@ -0,0 +1,127 @@ +""" +Data Transport - 1.0 +Steve L. 
Nyemba, The Phi Technology LLC + +This file is a wrapper around s3 bucket provided by AWS for reading and writing content +""" +from datetime import datetime +import boto +from boto.s3.connection import S3Connection, OrdinaryCallingFormat +import numpy as np +import botocore +from smart_open import smart_open +import sys + +import json +from io import StringIO +import json + +class s3 : + """ + @TODO: Implement a search function for a file given a bucket?? + """ + def __init__(self,**args) : + """ + This function will extract a file or set of files from s3 bucket provided + @param access_key + @param secret_key + @param path location of the file + @param filter filename or filtering elements + """ + try: + self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) + self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None + # self.path = args['path'] + self.filter = args['filter'] if 'filter' in args else None + self.filename = args['file'] if 'file' in args else None + self.bucket_name = args['bucket'] if 'bucket' in args else None + + except Exception as e : + self.s3 = None + self.bucket = None + print (e) + def meta(self,**args): + """ + :name name of the bucket + """ + info = self.list(**args) + [item.open() for item in info] + return [{"name":item.name,"size":item.size} for item in info] + def list(self,**args): + """ + This function will list the content of a bucket, the bucket must be provided by the name + :name name of the bucket + """ + return list(self.s3.get_bucket(args['name']).list()) + + + def buckets(self): + # + # This function will return all buckets, not sure why but it should be used cautiously + # based on why the s3 infrastructure is used + # + return [item.name for item in self.s3.get_all_buckets()] + + # def buckets(self): + pass + # """ + # This function is a wrapper around the bucket list of buckets for s3 + # """ + # return self.s3.get_all_buckets() + + +class Reader(s3) : + """ + Because s3 contains buckets and files, reading becomes a tricky proposition : + - list files if file is None + - stream content if file is Not None + @TODO: support read from all buckets, think about it + """ + def __init__(self,**args) : + s3.__init__(self,**args) + def files(self): + r = [] + try: + return [item.name for item in self.bucket if item.size > 0] + except Exception as e: + pass + return r + def stream(self,limit=-1): + """ + At this point we should stream a file from a given bucket + """ + key = self.bucket.get_key(self.filename.strip()) + if key is None : + yield None + else: + count = 0 + with smart_open(key) as remote_file: + for line in remote_file: + if count == limit and limit > 0 : + break + yield line + count += 1 + def read(self,**args) : + if self.filename is None : + # + # returning the list of files because no one file was specified. 
+ return self.files() + else: + limit = args['size'] if 'size' in args else -1 + return self.stream(limit) + +class Writer(s3) : + + def __init__(self,**args) : + s3.__init__(self,**args) + def mkdir(self,name): + """ + This function will create a folder in a bucket + :name name of the folder + """ + self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) + def write(self,content): + file = StringIO(content.decode("utf8")) + self.s3.upload_fileobj(file,self.bucket_name,self.filename) + pass + diff --git a/transport/nosql/__init__.py b/transport/nosql/__init__.py new file mode 100644 index 0000000..465b912 --- /dev/null +++ b/transport/nosql/__init__.py @@ -0,0 +1,10 @@ +""" +Steve L. Nyemba, nyemba@gmail.com +This namespace implements support for cloud databases couchdb,mongodb, cloudant ... +""" +from transport.nosql import couchdb +from transport.nosql import mongodb +# from . import mongodb +# from . import couchdb + +cloudant = couchdb \ No newline at end of file diff --git a/transport/nosql/couchdb.py b/transport/nosql/couchdb.py new file mode 100644 index 0000000..aa503fb --- /dev/null +++ b/transport/nosql/couchdb.py @@ -0,0 +1,213 @@ +""" +Data-Transport +Steve L. Nyemba, The Phi Technology + +This file is a wrapper around couchdb using IBM Cloudant SDK that has an interface to couchdb + +""" +import cloudant +import json +import sys +# from transport.common import Reader, Writer +from datetime import datetime + + +class Couch: + """ + This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. + @param url host & port reference default http://localhost:5984 + @param doc user id involved + @param dbname database name (target) + """ + def __init__(self,**args): + url = args['url'] if 'url' in args else 'http://localhost:5984' + self._id = args['doc'] + dbname = args['dbname'] + if 'username' not in args and 'password' not in args : + self.server = cloudant.CouchDB(None,None,url=url) + else: + self.server = cloudant.CouchDB(args['username'],args['password'],url=url) + self.server.connect() + + if dbname in self.server.all_dbs() : + self.dbase = self.server.get(dbname,dbname,True) + # + # @TODO Check if the database exists ... 
+ # + doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id) + if not doc.exists(): + doc = self.dbase.create_document({"_id":self._id}) + doc.save() + else: + self.dbase = None + """ + Insuring the preconditions are met for processing + """ + def isready(self): + p = self.server.metadata() != {} + if p == False or not self.dbase: + return False + # + # At this point we are sure that the server is connected + # We are also sure that the database actually exists + # + doc = cloudant.document.Document(self.dbase,self._id) + # q = self.dbase.all_docs(key=self._id)['rows'] + # if not q : + if not doc.exists(): + return False + return True + + def view(self,**args): + """ + The function will execute a view (provivded a user is authenticated) + :id design document _design/xxxx (provide full name with _design prefix) + :view_name name of the view i.e + :key(s) key(s) to be used to filter the content + """ + document = cloudant.design_document.DesignDocument(self.dbase,args['id']) + document.fetch() + params = {'group_level':1,'group':True} + if 'key' in args : + params ['key'] = args['key'] + elif 'keys' in args : + params['keys'] = args['keys'] + return document.get_view(args['view_name'])(**params)['rows'] + + + + +class Reader(Couch): + """ + This function will read an attachment from couchdb and return it to calling code. The attachment must have been placed before hand (otherwise oops) + @T: Account for security & access control + """ + def __init__(self,**args): + """ + @param filename filename (attachment) + """ + # + # setting the basic parameters for + Couch.__init__(self,**args) + if 'filename' in args : + self.filename = args['filename'] + else: + self.filename = None + + + def stream(self): + # + # @TODO Need to get this working ... + # + document = cloudant.document.Document(self.dbase,self._id) + # content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ; + content = self.get_attachment(self.filename) + for row in content: + yield row + + def read(self,**args): + if self.filename is not None: + self.stream() + else: + return self.basic_read() + def basic_read(self): + document = cloudant.document.Document(self.dbase,self._id) + + # document = self.dbase.get(self._id) + if document.exists() : + document.fetch() + document = dict(document) + del document['_rev'] + else: + document = {} + return document + +class Writer(Couch): + """ + This class will write on a couchdb document provided a scope + The scope is the attribute that will be on the couchdb document + """ + def __init__(self,**args): + """ + @param uri host & port reference + @param uid user id involved + @param filename filename (attachment) + @param dbname database name (target) + """ + + super().__init__(self,**args) + def set (self,info): + document = cloudant.document.Document(self.dbase,self._id) + if document.exists() : + keys = list(set(document.keys()) - set(['_id','_rev','_attachments'])) + for id in keys : + document.field_set(document,id,None) + for id in info : + value = info[id] + document.info(document,id,value) + + document.save() + pass + else: + _document = dict({"_id":self._id},**args) + document.create_document(_document) + def write(self,info): + """ + write a given attribute to a document database + @info object to be written to the to an attribute. 
this + """ + + # document = self.dbase.get(self._id) + document = cloudant.document.Document(self.dbase,self._id) #.get(self._id) + if document.exists() is False : + document = self.dbase.create_document({"_id":self._id}) + # label = params['label'] + # row = params['row'] + # if label not in document : + # document[label] = [] + # document[label].append(row) + for key in info : + if key in document and type(document[key]) == list : + document[key] += info[key] + else: + document[key] = info[key] + + document.save() + # self.dbase.bulk_docs([document]) + # self.dbase.save_doc(document) + + def upload(self,**args): + """ + :param name name of the file to be uploaded + :param data content of the file (binary or text) + :param content_type (default) + """ + mimetype = args['content_type'] if 'content_type' in args else 'text/plain' + document = cloudant.document.Document(self.dbase,self.uid) + document.put_attachment(self.dbase,args['filename'],mimetype,args['content']) + document.save() + + def archive(self,params=None): + """ + This function will archive the document onto itself. + """ + # document = self.dbase.all_docs(self._id,include_docs=True) + document = cloudant.document.Document(self.dbase,self.filename) + document.fetch() + content = {} + # _doc = {} + for id in document: + if id not in ['_id','_rev','_attachments'] : + content[id] = document[id] + del document[id] + + content = json.dumps(content) + # document= _doc + now = str(datetime.today()) + + name = '-'.join([document['_id'] , now,'.json']) + self.upload(filename=name,data=content,content_type='application/json') + # self.dbase.bulk_docs([document]) + # self.dbase.put_attachment(document,content,name,'application/json') + # document.put_attachment(self.dbase,name,'application/json',content) + # document.save() diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py new file mode 100644 index 0000000..2784cd2 --- /dev/null +++ b/transport/nosql/mongodb.py @@ -0,0 +1,242 @@ +""" +Data Transport - 1.0 +Steve L. 
Nyemba, The Phi Technology LLC + +This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) +""" +from pymongo import MongoClient +import bson +from bson.objectid import ObjectId +from bson.binary import Binary +# import nujson as json +from datetime import datetime +import pandas as pd +import numpy as np +import gridfs +import sys +import json +import re +from multiprocessing import Lock, RLock +from transport.common import IEncoder + +class Mongo : + lock = RLock() + """ + Basic mongodb functions are captured here + """ + def __init__(self,**args): + """ + :dbname database name/identifier + :host host and port of the database by default localhost:27017 + :username username for authentication + :password password for current user + """ + self.host = 'localhost' if 'host' not in args else args['host'] + self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] + # authSource=(args['authSource'] if 'authSource' in args else self.dbname) + self._lock = False if 'lock' not in args else args['lock'] + self.dbname = None + username = password = None + if 'auth_file' in args : + _info = json.loads((open(args['auth_file'])).read()) + + + else: + _info = {} + _args = dict(args,**_info) + _map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'} + for key in _args : + if key in ['username','password'] : + username = _args['username'] if key=='username' else username + password = _args['password'] if key == 'password' else password + continue + value = _args[key] + if key in _map : + key = _map[key] + + self.setattr(key,value) + # + # Let us perform aliasing in order to remain backwards compatible + + self.dbname = self.db if hasattr(self,'db')else self.dbname + self.collection = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) + if username and password : + self.client = MongoClient(self.host, + username=username, + password=password , + authSource=self.authSource, + authMechanism=self.mechanism) + + else: + self.client = MongoClient(self.host,maxPoolSize=10000) + + self.db = self.client[self.dbname] + + def isready(self): + p = self.dbname in self.client.list_database_names() + q = self.collection in self.client[self.dbname].list_collection_names() + return p and q + def setattr(self,key,value): + _allowed = ['host','port','db','doc','collection','authSource','mechanism'] + if key in _allowed : + setattr(self,key,value) + pass + def close(self): + self.client.close() + def meta(self,**_args): + return [] +class Reader(Mongo): + """ + This class will read from a mongodb data store and return the content of a document (not a collection) + """ + def __init__(self,**args): + Mongo.__init__(self,**args) + def read(self,**args): + + if 'mongo' in args or 'cmd' in args or 'pipeline' in args: + # + # @TODO: + cmd = {} + if 'aggregate' not in cmd and 'aggregate' not in args: + cmd['aggregate'] = self.collection + elif 'aggregate' in args : + cmd['aggregate'] = args['aggregate'] + if 'pipeline' in args : + cmd['pipeline']= args['pipeline'] + + if 'pipeline' not in args or 'aggregate' not in cmd : + cmd = args['mongo'] if 'mongo' in args else args['cmd'] + if "aggregate" in cmd : + if "allowDiskUse" not in cmd : + cmd["allowDiskUse"] = True + if "cursor" not in cmd : + cmd["cursor"] = {} + r = [] + out = self.db.command(cmd) + #@TODO: consider using a yield (generator) works wonders + while True : 
+ if 'values' in out : + r += out['values'] + if 'cursor' in out : + key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' + else: + key = 'n' + if 'cursor' in out and out['cursor'][key] : + r += list(out['cursor'][key]) + elif key in out and out[key]: + r.append (out[key]) + # yield out['cursor'][key] + if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : + break + else: + out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) + + + return pd.DataFrame(r) + else: + + + if 'table' in args or 'collection' in args : + if 'table' in args: + _uid = args['table'] + elif 'collection' in args : + _uid = args['collection'] + else: + _uid = self.collection + else: + _uid = self.collection + collection = self.db[_uid] + _filter = args['filter'] if 'filter' in args else {} + _df = pd.DataFrame(collection.find(_filter)) + columns = _df.columns.tolist()[1:] + return _df[columns] + def view(self,**args): + """ + This function is designed to execute a view (map/reduce) operation + """ + pass +class Writer(Mongo): + """ + This class is designed to write to a mongodb collection within a database + """ + def __init__(self,**args): + Mongo.__init__(self,**args) + def upload(self,**args) : + """ + This function will upload a file to the current database (using GridFS) + :param data binary stream/text to be stored + :param filename filename to be used + :param encoding content_encoding (default utf-8) + + """ + if 'encoding' not in args : + args['encoding'] = 'utf-8' + gfs = GridFS(self.db) + gfs.put(**args) + + def archive(self): + """ + This function will archive documents to the + """ + collection = self.db[self.collection] + rows = list(collection.find()) + for row in rows : + if type(row['_id']) == ObjectId : + row['_id'] = str(row['_id']) + stream = Binary(json.dumps(collection,cls=IEncoder).encode()) + collection.delete_many({}) + now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)]) + name = ".".join([self.collection,'archive',now])+".json" + description = " ".join([self.collection,'archive',str(len(rows))]) + self.upload(filename=name,data=stream,description=description,content_type='application/json') + # gfs = GridFS(self.db) + # gfs.put(filename=name,description=description,data=stream,encoding='utf-8') + # self.write({{"filename":name,"file":stream,"description":descriptions}}) + + + pass + + def write(self,info,**_args): + """ + This function will write to a given collection i.e add a record to a collection (no updates) + @param info new record in the collection to be added + """ + # document = self.db[self.collection].find() + #collection = self.db[self.collection] + # if type(info) == list : + # self.db[self.collection].insert_many(info) + # else: + try: + if 'table' in _args or 'collection' in _args : + _uid = _args['table'] if 'table' in _args else _args['collection'] + else: + _uid = self.collection if 'doc' not in _args else _args['doc'] + if self._lock : + Mongo.lock.acquire() + if type(info) == list or type(info) == pd.DataFrame : + info if type(info) == list else info.to_dict(orient='records') + info = json.loads(json.dumps(info,cls=IEncoder)) + self.db[_uid].insert_many(info) + else: + self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) + finally: + if self._lock : + Mongo.lock.release() + def set(self,document): + """ + if no identifier is provided the function will delete the entire collection and set the new document. 
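
Since set, described just above, replaces the whole collection, the everyday path is the append-style write shown in this minimal sketch (illustrative only, not part of the patch), assuming a MongoDB instance on localhost; the database and collection names are placeholders.

    from transport.nosql import mongodb

    _args = {'host': 'localhost:27017', 'db': 'analytics', 'collection': 'logs'}  # placeholders
    writer = mongodb.Writer(**_args)
    writer.write([{'status': 'ok', 'count': 10}])      # dict, list of dicts or DataFrame
    writer.close()

    reader = mongodb.Reader(**_args)
    df = reader.read(filter={'status': 'ok'})           # plain find(), returned as a DataFrame
    reader.close()
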
+ Please use this function with great care (archive the content first before using it... for safety) + """ + + collection = self.db[self.collection] + if collection.count_document() > 0 and '_id' in document: + id = document['_id'] + del document['_id'] + collection.find_one_and_replace({'_id':id},document) + else: + collection.delete_many({}) + self.write(info) + def close(self): + Mongo.close(self) + # collecton.update_one({"_id":self.collection},document,True) + diff --git a/transport/other/__init__.py b/transport/other/__init__.py new file mode 100644 index 0000000..ea26d80 --- /dev/null +++ b/transport/other/__init__.py @@ -0,0 +1 @@ +from . import files, http, rabbitmq, callback, files \ No newline at end of file diff --git a/transport/other/callback.py b/transport/other/callback.py new file mode 100644 index 0000000..29b03fc --- /dev/null +++ b/transport/other/callback.py @@ -0,0 +1,45 @@ +import queue +from threading import Thread, Lock +from transport.common import Reader,Writer +import numpy as np +import pandas as pd + +class Writer : + lock = Lock() + _queue = {'default':queue.Queue()} + def __init__(self,**_args): + self._cache = {} + self._callback = _args['callback'] if 'callback' in _args else None + self._id = _args['id'] if 'id' in _args else 'default' + if self._id not in Writer._queue : + Writer._queue[self._id] = queue.Queue() + thread = Thread(target=self._forward) + thread.start() + def _forward(self): + _q = Writer._queue[self._id] + _data = _q.get() + _q.task_done() + self._callback(_data) + + def has(self,**_args) : + return self._callback is not None + + + def close(self): + """ + This will empty the queue and have it ready for another operation + """ + _q = Writer._queue[self._id] + with _q.mutex: + _q.queue.clear() + _q.all_tasks_done.notify_all() + + def write(self,_data,**_args): + _id = _args['id'] if 'id' in _args else self._id + + _q = Writer._queue[_id] + _q.put(_data) + _q.join() + + + # self.callback = print \ No newline at end of file diff --git a/transport/other/console.py b/transport/other/console.py new file mode 100644 index 0000000..16f589a --- /dev/null +++ b/transport/other/console.py @@ -0,0 +1,7 @@ +from . 
import callback
+
+
+class Writer (callback.Writer):
+    def __init__(self,**_args):
+        super().__init__(callback=print)
+        
\ No newline at end of file
diff --git a/transport/other/files.py b/transport/other/files.py
new file mode 100644
index 0000000..a4e8a08
--- /dev/null
+++ b/transport/other/files.py
@@ -0,0 +1,68 @@
+"""
+This file is a wrapper around pandas built-in functionalities to handle character delimited files
+"""
+import pandas as pd
+import numpy as np
+import os
+class File :
+    def __init__(self,**params):
+        """
+
+        @param path absolute path of the file to be read
+        """
+        self.path = params['path'] if 'path' in params else None
+        self.delimiter = params['delimiter'] if 'delimiter' in params else ','
+
+    def isready(self):
+        return os.path.exists(self.path)
+    def meta(self,**_args):
+        return []
+
+class Reader (File):
+    """
+    This class is designed to read data from disk (location on hard drive)
+    @pre : isready() == True
+    """
+
+    def __init__(self,**_args):
+        super().__init__(**_args)
+
+    def read(self,**args):
+        _path = self.path if 'path' not in args else args['path']
+        _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter']
+        return pd.read_csv(_path,delimiter=_delimiter)
+    def stream(self,**args):
+        raise Exception ("streaming needs to be implemented")
+class Writer (File):
+
+    """
+    This class writes output to disk in a designated location, as a character-delimited text file
+        - If a delimiter is provided it will be used to generate an xchar-delimited file
+        - If not then the object will be dumped as is
+    """
+    # THREAD_LOCK = RLock()
+    def __init__(self,**_args):
+        super().__init__(**_args)
+        self._mode = 'w' if 'mode' not in _args else _args['mode']
+
+    def write(self,info,**_args):
+        """
+        This function writes a record (data-frame) to a designated file
+        @param info row(s)/data-frame to be written
+        @param path|delimiter|mode optional overrides of the constructor arguments
+        """
+        try:
+
+            _delim = self.delimiter if 'delimiter' not in _args else _args['delimiter']
+            _path = self.path if 'path' not in _args else _args['path']
+            _mode = self._mode if 'mode' not in _args else _args['mode']
+            info.to_csv(_path,index=False,sep=_delim,mode=_mode)
+
+            pass
+        except Exception as e:
+            #
+            # Not sure what should be done here ...
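+            # NOTE: the exception is currently swallowed; logging `e` or re-raising it would be a reasonable policy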
+            pass
+        finally:
+            # DiskWriter.THREAD_LOCK.release()
+            pass
\ No newline at end of file
diff --git a/transport/other/http.py b/transport/other/http.py
new file mode 100644
index 0000000..d92e334
--- /dev/null
+++ b/transport/other/http.py
@@ -0,0 +1,88 @@
+from flask import request, session
+from datetime import datetime
+import re
+# from transport.common import Reader, Writer
+import json
+import requests
+from io import StringIO
+import pandas as pd
+
+
+class Reader:
+    """
+    This class is designed to read data from an HTTP endpoint (url), returning a data-frame when possible
+    The whole response is held in memory and processed accordingly
+    NOTE: This is inefficient and can crash a micro-instance (be careful)
+    """
+
+    def __init__(self,**_args):
+        self._url = _args['url']
+        self._headers = None if 'headers' not in _args else _args['headers']
+
+    # def isready(self):
+    #     return self.file_length > 0
+    def format(self,_response):
+        _mimetype= _response.headers['Content-Type']
+        if 'text/csv' in _mimetype :
+            _content = _response.text
+            return pd.read_csv(StringIO(_content))
+        #
+        # @TODO: Add support for excel, JSON and other file formats that fit into a data-frame
+        #
+
+        return _response.text
+    def read(self,**_args):
+        if self._headers :
+            r = requests.get(self._url,headers = self._headers)
+        else:
+            r = requests.get(self._url)
+        return self.format(r)
+
+class Writer:
+    """
+    This class is designed to submit data to an endpoint (url)
+    """
+    def __init__(self,**_args):
+        """
+        @param url target endpoint
+        @param name label under which the payload is submitted
+        """
+        self._url = _args['url']
+        self._name = _args['name']
+        self._method = 'post' if 'method' not in _args else _args['method']
+
+        # self.session = params['queue']
+        # self.session['sql'] = []
+        # self.session['csv'] = []
+        # self.tablename = re.sub('..+$','',params['filename'])
+        # self.session['uid'] = params['uid']
+        #self.xchar = params['xchar']
+
+
+    def format_sql(self,row):
+        values = "','".join([col.replace('"','').replace("'",'') for col in row])
+        return "".join(["INSERT INTO :table VALUES('",values,"');\n"]).replace(':table',self.tablename)
+    def isready(self):
+        return True
+    def write(self,_data,**_args):
+        #
+        #
+        _method = self._method if 'method' not in _args else _args['method']
+        _method = _method.lower()
+        _mimetype = 'text/csv'
+        if type(_data) == dict :
+            _mimetype = 'application/json'
+            _content = _data
+        else:
+            _content = _data.to_dict(orient='records')
+        _headers = {'Content-Type':_mimetype}
+        _pointer = getattr(requests,_method)
+
+        _pointer (self._url,data={self._name:_content},headers=_headers)
+
+
+        # label = params['label']
+        # row = params ['row']
+
+        # if label == 'usable':
+        #     self.session['csv'].append(self.format(row,','))
+        #     self.session['sql'].append(self.format_sql(row))
diff --git a/transport/other/rabbitmq.py b/transport/other/rabbitmq.py
new file mode 100644
index 0000000..f56800d
--- /dev/null
+++ b/transport/other/rabbitmq.py
@@ -0,0 +1,272 @@
+"""
+Data Transport - 1.0
+Steve L.
Nyemba, The Phi Technology LLC + +This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange) + +""" +import pika +from datetime import datetime +import re +import json +import os +import sys +# if sys.version_info[0] > 2 : +# from transport.common import Reader, Writer +# else: +# from common import Reader, Writer +import json +from multiprocessing import RLock +class MessageQueue: + """ + This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) + :host + :xid identifier of the exchange + :qid identifier of the queue + """ + def __init__(self,**params): + self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server + self.port= 5672 if 'port' not in params else params['port'] + self.virtual_host = '/' if 'vhost' not in params else params['vhost'] + self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange + self.queue = params['queue'] if 'queue' in params else 'demo' + self.connection = None + self.channel = None + + self.name = self.__class__.__name__.lower() if 'name' not in params else params['name'] + + username = password = None + if 'username' in params : + username = params['username'] + password = params['password'] + if 'auth_file' in params : + _info = json.loads((open(params['auth_file'])).read()) + username=_info['username'] + password=_info['password'] + self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host + self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange + self.queue = _info['queue'] if 'queue' in _info else self.queue + + self.credentials= pika.PlainCredentials('guest','guest') + if 'username' in params : + self.credentials = pika.PlainCredentials( + params['username'], + ('' if 'password' not in params else params['password']) + ) + + def init(self,label=None): + properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host, + client_properties={'connection_name':self.name}, + credentials=self.credentials) + self.connection = pika.BlockingConnection(properties) + self.channel = self.connection.channel() + self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) + if label is None: + self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True) + else: + self.qhandler = self.channel.queue_declare(queue=label,durable=True) + + self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue) + + def isready(self): + #self.init() + resp = self.connection is not None and self.connection.is_open + # self.close() + return resp + def finalize(self): + pass + def close(self): + if self.connection.is_closed == False : + self.channel.close() + self.connection.close() + +class Writer(MessageQueue): + """ + This class is designed to publish content to an AMQP (Rabbitmq) + The class will rely on pika to implement this functionality + + We will publish information to a given queue for a given exchange + """ + def __init__(self,**params): + #self.host= params['host'] + #self.exchange = params['uid'] + #self.queue = params['queue'] + MessageQueue.__init__(self,**params); + self.init() + def write(self,data,_type='text/plain'): + """ + This function writes a stream of data to the a given queue + @param object object to be written (will be converted to JSON) + @TODO: make this less chatty + """ + + stream = json.dumps(data) if 
isinstance(data,dict) else data + self.channel.basic_publish( + exchange=self.exchange, + routing_key=self.queue, + body=stream, + properties=pika.BasicProperties(content_type=_type,delivery_mode=2) + ); + # self.close() + + def flush(self): + self.init() + _mode = 1 #-- Non persistent + self.channel.queue_delete( queue=self.queue); + self.close() + +class Reader(MessageQueue): + """ + This class will read from a queue provided an exchange, queue and host + @TODO: Account for security and virtualhosts + """ + + def __init__(self,**params): + """ + @param host host + @param uid exchange identifier + @param qid queue identifier + """ + + #self.host= params['host'] + #self.exchange = params['uid'] + #self.queue = params['qid'] + MessageQueue.__init__(self,**params); + # self.init() + self.durable = False if 'durable' not in params else params['durable'] + # if 'durable' in params : + # self.durable = True + # else: + # self.durable = False + self.size = -1 + self.data = {} + # def init(self,qid): + + # properties = pika.ConnectionParameters(host=self.host) + # self.connection = pika.BlockingConnection(properties) + # self.channel = self.connection.channel() + # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True) + + # self.info = self.channel.queue_declare(queue=qid,durable=True) + + + def callback(self,channel,method,header,stream): + """ + This is the callback function designed to process the data stream from the queue + + """ + + r = [] + # if re.match("^\{|\[",stream) is not None: + if stream.startswith(b'{') or stream.startswith(b'['): + r = json.loads(stream) + else: + + r = stream + + qid = self.qhandler.method.queue + if qid not in self.data : + self.data[qid] = [] + + self.data[qid].append(r) + # + # We stop reading when the all the messages of the queue are staked + # + if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: + self.close() + + def read(self,**args): + """ + This function will read, the first message from a queue + @TODO: + Implement channel.basic_get in order to retrieve a single message at a time + Have the number of messages retrieved be specified by size (parameter) + """ + r = {} + self.size = -1 if 'size' in args else int(args['size']) + # + # We enabled the reader to be able to read from several queues (sequentially for now) + # The qid parameter will be an array of queues the reader will be reading from + # + if isinstance(self.queue,str) : + self.queue = [self.queue] + + for qid in self.queue: + self.init(qid) + # r[qid] = [] + + if self.qhandler.method.message_count > 0: + + self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False); + self.channel.start_consuming() + else: + + pass + #self.close() + # r[qid].append( self.data) + + return self.data +class QueueListener(MessageQueue): + lock = RLock() + """ + This class is designed to have an active listener (worker) against a specified Exchange/Queue + It is initialized as would any other object and will require a callback function to address the objects returned. 
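+
+    A minimal usage sketch (the host, queue name and handler function below are assumptions):
+
+        _listener = QueueListener(host='localhost',queue='demo',apply=my_handler)
+        _listener.read() #-- blocks and invokes my_handler on every message received
+    """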
+ """ + def __init__(self,**args): + MessageQueue.__init__(self,**args) + self.listen = self.read + self.apply = args['apply'] if 'apply' in args else print + self.lock = False if 'lock' not in args else args['lock'] + + def finalize(self,channel,ExceptionReason): + pass + + def callback(self,channel,method,header,stream) : + _info= {} + # if re.match("^\{|\[",stream) is not None: + + + if stream.startswith(b"[") or stream.startswith(b"{"): + _info = json.loads(stream) + else: + + _info = stream + # + # At this point we should invoke the apply function with a lock if need be + # @TODO: Establish a vocabulary + + if stream == b'QUIT' : + # channel.exit() + self.close() + if self.lock == True : + QueueListener.lock.acquire() + try: + # + # In case the user has not specified a function to apply the data against, it will simply be printed + # + self.apply(_info) + except Exception as e: + pass + if self.lock == True : + QueueListener.lock.release() + def read(self): + + self.init(self.queue) + + self.channel.basic_consume(self.queue,self.callback,auto_ack=True); + self.channel.start_consuming() + + + +class Factory : + @staticmethod + def instance(**_args): + """ + :param count number of workers + :param apply function workers + """ + _apply = _args['apply'] + _count = _args['count'] + for i in np.arange(_count) : + _name = _args['name'] if 'name' in _args else 'worker_'+str(i) + transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file']) \ No newline at end of file diff --git a/transport/providers.py b/transport/providers.py deleted file mode 100644 index ddb2fcb..0000000 --- a/transport/providers.py +++ /dev/null @@ -1,105 +0,0 @@ -# from transport.common import Reader, Writer,Console #, factory -from transport import disk -import sqlite3 -from transport import s3 as s3 -from transport import rabbitmq as queue -from transport import couch as couch -from transport import mongo as mongo -from transport import sql as sql -from transport import etl as etl -from transport import qlistener -from transport import bricks -from transport import session -from transport import nextcloud -import psycopg2 as pg -import mysql.connector as my -from google.cloud import bigquery as bq -import nzpy as nz #--- netezza drivers -import os - -from info import __version__ - -POSTGRESQL = 'postgresql' -MONGODB = 'mongodb' -HTTP='http' -BIGQUERY ='bigquery' -FILE = 'file' -ETL = 'etl' -SQLITE = 'sqlite' -SQLITE3= 'sqlite' -REDSHIFT = 'redshift' -NETEZZA = 'netezza' -MYSQL = 'mysql+mysqlconnector' -RABBITMQ = 'rabbitmq' -MARIADB = 'mariadb' -COUCHDB = 'couch' -CONSOLE = 'console' -ETL = 'etl' -TRANSPORT = ETL -NEXTCLOUD = 'nextcloud' - -# -# synonyms of the above -BQ = BIGQUERY -MONGO = MONGODB -FERRETDB= MONGODB -PG = POSTGRESQL -PSQL = POSTGRESQL -PGSQL = POSTGRESQL -S3 = 's3' -AWS_S3 = 's3' -RABBIT = RABBITMQ - -QLISTENER = 'qlistener' -QUEUE = QLISTENER -CALLBACK = QLISTENER -DATABRICKS= 'databricks+connector' -DRIVERS = {PG:pg,REDSHIFT:pg,MYSQL:my,MARIADB:my,NETEZZA:nz,SQLITE:sqlite3} -CATEGORIES ={'sql':[NETEZZA,PG,MYSQL,REDSHIFT,SQLITE,MARIADB],'nosql':[MONGODB,COUCHDB],'cloud':[NEXTCLOUD,S3,BIGQUERY,DATABRICKS],'file':[FILE], - 'queue':[RABBIT,QLISTENER],'memory':[CONSOLE,QUEUE],'http':[HTTP]} - -READ = {'sql':sql.SQLReader,'nosql':{MONGODB:mongo.MongoReader,COUCHDB:couch.CouchReader}, - 'cloud':{BIGQUERY:sql.BigQueryReader,DATABRICKS:bricks.BricksReader,NEXTCLOUD:nextcloud.NextcloudReader}, - 
'file':disk.DiskReader,'queue':{RABBIT:queue.QueueReader,QLISTENER:qlistener.qListener}, - # 'cli':{CONSOLE:Console},'memory':{CONSOLE:Console},'http':session.HttpReader - } -WRITE = {'sql':sql.SQLWriter,'nosql':{MONGODB:mongo.MongoWriter,COUCHDB:couch.CouchWriter}, - 'cloud':{BIGQUERY:sql.BigQueryWriter,DATABRICKS:bricks.BricksWriter,NEXTCLOUD:nextcloud.NextcloudWriter}, - 'file':disk.DiskWriter,'queue':{RABBIT:queue.QueueWriter,QLISTENER:qlistener.qListener}, - # 'cli':{CONSOLE:Console}, - # 'memory':{CONSOLE:Console}, 'http':session.HttpReader - - } -# SQL_PROVIDERS = [POSTGRESQL,MYSQL,NETEZZA,MARIADB,SQLITE] -PROVIDERS = { - FILE:{'read':disk.DiskReader,'write':disk.DiskWriter}, - SQLITE:{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, - 'sqlite3':{'read':disk.SQLiteReader,'write':disk.SQLiteWriter,'driver':sqlite3}, - - POSTGRESQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, - NETEZZA:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':nz,'default':{'port':5480}}, - REDSHIFT:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':pg,'default':{'host':'localhost','port':5432}}, - RABBITMQ:{'read':queue.QueueReader,'writer':queue.QueueWriter,'context':queue.QueueListener,'default':{'host':'localhost','port':5432}}, - - MYSQL:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, - MARIADB:{'read':sql.SQLReader,'write':sql.SQLWriter,'driver':my,'default':{'host':'localhost','port':3306}}, - - S3:{'read':s3.s3Reader,'write':s3.s3Writer}, - BIGQUERY:{'read':sql.BigQueryReader,'write':sql.BigQueryWriter}, - DATABRICKS:{'read':bricks.BricksReader,'write':bricks.BricksWriter}, - NEXTCLOUD:{'read':nextcloud.NextcloudReader,'write':nextcloud.NextcloudWriter}, - - QLISTENER:{'read':qlistener.qListener,'write':qlistener.qListener,'default':{'host':'localhost','port':5672}}, - CONSOLE:{'read':qlistener.Console,"write":qlistener.Console}, - HTTP:{'read':session.HttpReader,'write':session.HttpWriter}, - - MONGODB:{'read':mongo.MongoReader,'write':mongo.MongoWriter,'default':{'port':27017,'host':'localhost'}}, - COUCHDB:{'read':couch.CouchReader,'writer':couch.CouchWriter,'default':{'host':'localhost','port':5984}}, -# ETL :{'read':etl.Transporter,'write':etl.Transporter} - ETL :{'read':etl.instance,'write':etl.instance} -} -DEFAULT = {PG:{'host':'localhost','port':5432},MYSQL:{'host':'localhost','port':3306}} -DEFAULT[MONGODB] = {'port':27017,'host':'localhost'} -DEFAULT[REDSHIFT] = DEFAULT[PG] -DEFAULT[MARIADB] = DEFAULT[MYSQL] -DEFAULT[NETEZZA] = {'port':5480} diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py new file mode 100644 index 0000000..fc0f1e7 --- /dev/null +++ b/transport/providers/__init__.py @@ -0,0 +1,44 @@ +""" +This file is intended to aggregate all we can about the framework in terms of support +""" + +BIGQUERY='bigquery' + +POSTGRESQL = 'postgresql' +MONGODB = 'mongodb' +HTTP='http' +BIGQUERY ='bigquery' +FILE = 'file' +ETL = 'etl' +SQLITE = 'sqlite' +SQLITE3= 'sqlite3' +REDSHIFT = 'redshift' +NETEZZA = 'netezza' +MYSQL = 'mysql' +MARIADB= MYSQL + +COUCHDB = 'couchdb' +CONSOLE = 'console' +ETL = 'etl' +TRANSPORT = ETL +NEXTCLOUD = 'nextcloud' +S3 = 's3' +CALLBACK = 'callback' +CONSOLE = 'console' +RABBITMQ = 'rabbitmq' +DATABRICKS= 'databricks' + +# +# synonyms of the above +BQ = BIGQUERY +MONGO = MONGODB +FERRETDB= MONGODB +PG = POSTGRESQL +PSQL = POSTGRESQL +PGSQL = POSTGRESQL + +AWS_S3 = 's3' +RABBIT = RABBITMQ + +# QLISTENER = 
'qlistener' + \ No newline at end of file diff --git a/transport/sql.py b/transport/sql.py deleted file mode 100644 index c5b52d4..0000000 --- a/transport/sql.py +++ /dev/null @@ -1,526 +0,0 @@ -""" -This file is intended to perform read/writes against an SQL database such as PostgreSQL, Redshift, Mysql, MsSQL ... - -LICENSE (MIT) -Copyright 2016-2020, The Phi Technology LLC - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - - -@TODO: - - Migrate SQLite to SQL hierarchy - - Include Write in Chunks from pandas -""" -import psycopg2 as pg -import mysql.connector as my -import sys - -import sqlalchemy -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer #, factory -else: - from common import Reader,Writer -import json -from google.oauth2 import service_account -from google.cloud import bigquery as bq -# import constants.bq_utils as bq_consts - -from multiprocessing import Lock, RLock -import pandas as pd -import pandas_gbq as pd_gbq -import numpy as np -import nzpy as nz #--- netezza drivers -import sqlite3 -import copy -import os -import time - -class SQLRW : - lock = RLock() - MAX_CHUNK = 2000000 - DRIVERS = {"postgresql":pg,"redshift":pg,"mysql":my,"mariadb":my,"netezza":nz} - REFERENCE = { - "netezza":{"port":5480,"handler":nz,"dtype":"VARCHAR(512)"}, - "postgresql":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, - "redshift":{"port":5432,"handler":pg,"dtype":"VARCHAR"}, - "mysql":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, - "mariadb":{"port":3360,"handler":my,"dtype":"VARCHAR(256)"}, - } - def __init__(self,**_args): - - - _info = {} - _info['dbname'] = _args['db'] if 'db' in _args else _args['database'] - self.table = _args['table'] if 'table' in _args else None - self.fields = _args['fields'] if 'fields' in _args else [] - self.schema = _args['schema'] if 'schema' in _args else '' - self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) - - self._provider = _args['provider'] if 'provider' in _args else None - # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else _args['port'] - - _info['host'] = _args['host'] if 'host' in _args else '' - _info['port'] = _args['port'] if 'port' in _args else '' - - # if 'host' in _args : - # _info['host'] = 'localhost' if 'host' not in _args else _args['host'] - # # _info['port'] = SQLWriter.PROVIDERS[_args['provider']] if 'port' not in _args else _args['port'] - # _info['port'] = SQLWriter.REFERENCE[_provider]['port'] if 'port' not in _args else 
_args['port'] - self.lock = False if 'lock' not in _args else _args['lock'] - if 'username' in _args or 'user' in _args: - key = 'username' if 'username' in _args else 'user' - _info['user'] = _args[key] - _info['password'] = _args['password'] if 'password' in _args else '' - if 'auth_file' in _args : - _auth = json.loads( open(_args['auth_file']).read() ) - key = 'username' if 'username' in _auth else 'user' - _info['user'] = _auth[key] - _info['password'] = _auth['password'] if 'password' in _auth else '' - - _info['host'] = _auth['host'] if 'host' in _auth else _info['host'] - _info['port'] = _auth['port'] if 'port' in _auth else _info['port'] - if 'database' in _auth: - _info['dbname'] = _auth['database'] - self.table = _auth['table'] if 'table' in _auth else self.table - # - # We need to load the drivers here to see what we are dealing with ... - - - # _handler = SQLWriter.REFERENCE[_provider]['handler'] - _handler = _args['driver'] #-- handler to the driver - self._dtype = _args['default']['type'] if 'default' in _args and 'type' in _args['default'] else 'VARCHAR(256)' - # self._provider = _args['provider'] - # self._dtype = SQLWriter.REFERENCE[_provider]['dtype'] if 'dtype' not in _args else _args['dtype'] - # self._provider = _provider - if _handler == nz : - _info['database'] = _info['dbname'] - _info['securityLevel'] = 0 - del _info['dbname'] - if _handler == my : - _info['database'] = _info['dbname'] - del _info['dbname'] - if _handler == sqlite3 : - _info = {'path':_info['dbname'],'isolation_level':'IMMEDIATE'} - if _handler != sqlite3 : - self.conn = _handler.connect(**_info) - else: - self.conn = _handler.connect(_info['path'],isolation_level='IMMEDIATE') - self._engine = _args['sqlalchemy'] if 'sqlalchemy' in _args else None - def meta(self,**_args): - schema = [] - try: - if self._engine : - table = _args['table'] if 'table' in _args else self.table - if sqlalchemy.__version__.startswith('1.') : - _m = sqlalchemy.MetaData(bind=self._engine) - _m.reflect() - else: - - _m = sqlalchemy.MetaData() - _m.reflect(bind=self._engine) - schema = [{"name":_attr.name,"type":str(_attr.type)} for _attr in _m.tables[table].columns] - # - # Some house keeping work - _m = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} - for _item in schema : - if _item['type'] in _m : - _item['type'] = _m[_item['type']] - - except Exception as e: - print (e) - pass - return schema - def _tablename(self,name) : - - return self.schema +'.'+name if self.schema not in [None, ''] and '.' 
not in name else name - def has(self,**_args): - return self.meta(**_args) - # found = False - # try: - - # table = self._tablename(_args['table'])if 'table' in _args else self._tablename(self.table) - # sql = "SELECT * FROM :table LIMIT 1".replace(":table",table) - # if self._engine : - # _conn = self._engine.connect() - # else: - # _conn = self.conn - # found = pd.read_sql(sql,_conn).shape[0] - # found = True - - # except Exception as e: - # print (e) - # pass - # finally: - # if not self._engine : - # _conn.close() - # return found - def isready(self): - _sql = "SELECT * FROM :table LIMIT 1".replace(":table",self.table) - try: - _conn = self.conn if not hasattr(self,'_engine') else self._engine - return pd.read_sql(_sql,_conn).columns.tolist() - except Exception as e: - pass - return False - def apply(self,_sql): - """ - This function applies a command and/or a query against the current relational data-store - :param _sql insert/select statement - @TODO: Store procedure calls - """ - # - _out = None - try: - if _sql.lower().startswith('select') : - - _conn = self._engine if self._engine else self.conn - return pd.read_sql(_sql,_conn) - else: - # Executing a command i.e no expected return values ... - cursor = self.conn.cursor() - cursor.execute(_sql) - self.conn.commit() - except Exception as e : - print (e) - finally: - if not self._engine : - self.conn.commit() - # cursor.close() - def close(self): - try: - self.conn.close() - except Exception as error : - print (error) - pass -class SQLReader(SQLRW,Reader) : - def __init__(self,**_args): - super().__init__(**_args) - - def read(self,**_args): - if 'sql' in _args : - _sql = (_args['sql']) - else: - if 'table' in _args : - table = _args['table'] - else: - table = self.table - # table = self.table if self.table is not None else _args['table'] - _sql = "SELECT :fields FROM "+self._tablename(table) - if 'filter' in _args : - _sql = _sql +" WHERE "+_args['filter'] - if 'fields' in _args : - _fields = _args['fields'] - else: - _fields = '*' if not self.fields else ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - # - # At this point we have a query we can execute gracefully - if 'limit' in _args : - _sql = _sql + " LIMIT "+str(_args['limit']) - # - # @TODO: - # It is here that we should inspect to see if there are any pre/post conditions - # - return self.apply(_sql) - def close(self) : - try: - self.conn.close() - except Exception as error : - print (error) - pass - -class SQLWriter(SQLRW,Writer): - def __init__(self,**_args) : - super().__init__(**_args) - # - # In the advent that data typing is difficult to determine we can inspect and perform a default case - # This slows down the process but improves reliability of the data - # NOTE: Proper data type should be set on the target system if their source is unclear. 
- - self._cast = False if 'cast' not in _args else _args['cast'] - - def init(self,fields=None): - # if not fields : - # try: - # table = self._tablename(self.table) - # self.fields = pd.read_sql_query("SELECT * FROM :table LIMIT 1".replace(":table",table),self.conn).columns.tolist() - # except Exception as e: - # pass - # finally: - # pass - # else: - self.fields = fields; - - def make(self,**_args): - table = self._tablename(self.table) if 'table' not in _args else self._tablename(_args['table']) - if 'fields' in _args : - fields = _args['fields'] - # table = self._tablename(self.table) - sql = " ".join(["CREATE TABLE",table," (", ",".join([ name +' '+ self._dtype for name in fields]),")"]) - - else: - schema = _args['schema'] if 'schema' in _args else [] - - _map = _args['map'] if 'map' in _args else {} - sql = [] # ["CREATE TABLE ",_args['table'],"("] - for _item in schema : - _type = _item['type'] - if _type in _map : - _type = _map[_type] - sql = sql + [" " .join([_item['name'], ' ',_type])] - sql = ",".join(sql) - # table = self._tablename(_args['table']) - sql = ["CREATE TABLE ",table,"( ",sql," )"] - sql = " ".join(sql) - - cursor = self.conn.cursor() - try: - - cursor.execute(sql) - except Exception as e : - print (e) - # print (sql) - pass - finally: - # cursor.close() - self.conn.commit() - pass - def write(self,info,**_args): - """ - :param info writes a list of data to a given set of fields - """ - # inspect = False if 'inspect' not in _args else _args['inspect'] - # cast = False if 'cast' not in _args else _args['cast'] - # if not self.fields : - # if type(info) == list : - # _fields = info[0].keys() - # elif type(info) == dict : - # _fields = info.keys() - # elif type(info) == pd.DataFrame : - # _fields = info.columns.tolist() - - # # _fields = info.keys() if type(info) == dict else info[0].keys() - # # _fields = list (_fields) - # self.init(_fields) - - try: - table = _args['table'] if 'table' in _args else self.table - # - # In SQL, schema can stand for namespace or the structure of a table - # In case we have a list, we are likely dealing with table structure - # - if 'schema' in _args : - if type(_args['schema']) == str : - self.schema = _args['schema'] if 'schema' in _args else self.schema - elif type(_args['schema']) == list and len(_args['schema']) > 0 and not self.has(table=table): - # - # There is a messed up case when an empty array is passed (no table should be created) - # - self.make(table=table,schema=_args['schema']) - pass - # self.schema = _args['schema'] if 'schema' in _args else self.schema - table = self._tablename(table) - - _sql = "INSERT INTO :table (:fields) VALUES (:values)".replace(":table",table) #.replace(":table",self.table).replace(":fields",_fields) - - if type(info) == list : - _info = pd.DataFrame(info) - elif type(info) == dict : - _info = pd.DataFrame([info]) - else: - _info = pd.DataFrame(info) - - - if _info.shape[0] == 0 : - - return - if self.lock : - SQLRW.lock.acquire() - # - # we will adjust the chunks here in case we are not always sure of the - if self._chunks == 1 and _info.shape[0] > SQLRW.MAX_CHUNK : - self._chunks = 10 - _indexes = np.array_split(np.arange(_info.shape[0]),self._chunks) - for i in _indexes : - # - # In case we have an invalid chunk ... 
- if _info.iloc[i].shape[0] == 0 : - continue - # - # We are enabling writing by chunks/batches because some persistent layers have quotas or limitations on volume of data - - if self._engine is not None: - # pd.to_sql(_info,self._engine) - if self.schema in ['',None] : - rows = _info.iloc[i].to_sql(table,self._engine,if_exists='append',index=False) - else: - # - # Writing with schema information ... - rows = _info.iloc[i].to_sql(self.table,self._engine,schema=self.schema,if_exists='append',index=False) - time.sleep(1) - else: - _fields = ",".join(self.fields) - _sql = _sql.replace(":fields",_fields) - values = ", ".join("?"*len(self.fields)) if self._provider == 'netezza' else ",".join(["%s" for name in self.fields]) - _sql = _sql.replace(":values",values) - cursor = self.conn.cursor() - cursor.executemany(_sql,_info.iloc[i].values.tolist()) - cursor.close() - # cursor.commit() - - # self.conn.commit() - except Exception as e: - print(e) - pass - finally: - - if self._engine is None : - self.conn.commit() - if self.lock : - SQLRW.lock.release() - # cursor.close() - pass - def close(self): - try: - self.conn.close() - finally: - pass -class BigQuery: - def __init__(self,**_args): - path = _args['service_key'] if 'service_key' in _args else _args['private_key'] - self.credentials = service_account.Credentials.from_service_account_file(path) - self.dataset = _args['dataset'] if 'dataset' in _args else None - self.path = path - self.dtypes = _args['dtypes'] if 'dtypes' in _args else None - self.table = _args['table'] if 'table' in _args else None - self.client = bq.Client.from_service_account_json(self.path) - def meta(self,**_args): - """ - This function returns meta data for a given table or query with dataset/table properly formatted - :param table name of the name WITHOUT including dataset - :param sql sql query to be pulled, - """ - table = _args['table'] if 'table' in _args else self.table - - try: - if table : - _dataset = self.dataset if 'dataset' not in _args else _args['dataset'] - sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """ - _info = {'credentials':self.credentials,'dialect':'standard'} - return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records') - # return self.read(sql=sql).to_dict(orient='records') - # ref = self.client.dataset(self.dataset).table(table) - - # _schema = self.client.get_table(ref).schema - # return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema] - else : - return [] - except Exception as e: - - return [] - def has(self,**_args): - found = False - try: - _has = self.meta(**_args) - found = _has is not None and len(_has) > 0 - except Exception as e: - pass - return found -class BQReader(BigQuery,Reader) : - def __init__(self,**_args): - - super().__init__(**_args) - def apply(self,sql): - return self.read(sql=sql) - - def read(self,**_args): - SQL = None - table = self.table if 'table' not in _args else _args['table'] - if 'sql' in _args : - SQL = _args['sql'] - elif table: - - table = "".join(["`",table,"`"]) if '.' 
in table else "".join(["`:dataset.",table,"`"]) - SQL = "SELECT * FROM :table ".replace(":table",table) - if not SQL : - return None - if SQL and 'limit' in _args: - SQL += " LIMIT "+str(_args['limit']) - if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset: - SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset) - _info = {'credentials':self.credentials,'dialect':'standard'} - return pd_gbq.read_gbq(SQL,**_info) if SQL else None - # return self.client.query(SQL).to_dataframe() if SQL else None - - -class BQWriter(BigQuery,Writer): - lock = Lock() - def __init__(self,**_args): - super().__init__(**_args) - - self.parallel = False if 'lock' not in _args else _args['lock'] - self.table = _args['table'] if 'table' in _args else None - self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials} - self._chunks = 1 if 'chunks' not in _args else int(_args['chunks']) - self._location = 'US' if 'location' not in _args else _args['location'] - def write(self,_info,**_args) : - try: - if self.parallel or 'lock' in _args : - BQWriter.lock.acquire() - _args['table'] = self.table if 'table' not in _args else _args['table'] - self._write(_info,**_args) - finally: - if self.parallel: - BQWriter.lock.release() - def submit(self,_sql): - """ - Write the output of a massive query to a given table, biquery will handle this as a job - This function will return the job identifier - """ - _config = bq.QueryJobConfig() - _config.destination = self.client.dataset(self.dataset).table(self.table) - _config.allow_large_results = True - # _config.write_disposition = bq.bq_consts.WRITE_APPEND - _config.dry_run = False - # _config.priority = 'BATCH' - _resp = self.client.query(_sql,location=self._location,job_config=_config) - return _resp.job_id - def status (self,_id): - return self.client.get_job(_id,location=self._location) - def _write(self,_info,**_args) : - _df = None - if type(_info) in [list,pd.DataFrame] : - if type(_info) == list : - _df = pd.DataFrame(_info) - elif type(_info) == pd.DataFrame : - _df = _info - - if '.' not in _args['table'] : - self.mode['destination_table'] = '.'.join([self.dataset,_args['table']]) - else: - - self.mode['destination_table'] = _args['table'].strip() - if 'schema' in _args : - self.mode['table_schema'] = _args['schema'] - # - # Let us insure that the types are somewhat compatible ... - # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} - # _mode = copy.deepcopy(self.mode) - _mode = self.mode - # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) - # - # Let us adjust the chunking here - self._chunks = 10 if _df.shape[0] > SQLRW.MAX_CHUNK and self._chunks == 1 else self._chunks - _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) - for i in _indexes : - _df.iloc[i].to_gbq(**self.mode) - time.sleep(1) - pass -# -# Aliasing the big query classes allowing it to be backward compatible -# -BigQueryReader = BQReader -BigQueryWriter = BQWriter \ No newline at end of file diff --git a/transport/sql/__init__.py b/transport/sql/__init__.py new file mode 100644 index 0000000..557d36d --- /dev/null +++ b/transport/sql/__init__.py @@ -0,0 +1,18 @@ +""" +This namespace/package wrap the sql functionalities for a certain data-stores + - netezza, postgresql, mysql and sqlite + - mariadb, redshift (also included) +""" +from . 
import postgresql, mysql, netezza, sqlite + + +# +# Creating aliases for support of additional data-store providerss +# +mariadb = mysql +redshift = postgresql +sqlite3 = sqlite + + +# from transport import sql + diff --git a/transport/sql/common.py b/transport/sql/common.py new file mode 100644 index 0000000..89dcefb --- /dev/null +++ b/transport/sql/common.py @@ -0,0 +1,125 @@ +""" +This file encapsulates common operations associated with SQL databases via SQLAlchemy + +""" +import sqlalchemy as sqa +import pandas as pd + +class Base: + def __init__(self,**_args): + self._host = _args['host'] if 'host' in _args else 'localhost' + self._port = None + self._database = _args['database'] + self._table = _args['table'] if 'table' in _args else None + self._engine= sqa.create_engine(self._get_uri(**_args),future=True) + def _set_uri(self,**_args) : + """ + :provider provider + :host host and port + :account account user/pwd + """ + _account = _args['account'] if 'account' in _args else None + _host = _args['host'] + _provider = _args['provider'].replace(':','').replace('/','').strip() + def _get_uri(self,**_args): + """ + This function will return the formatted uri for the sqlAlchemy engine + """ + raise Exception ("Function Needs to be implemented ") + def meta (self,**_args): + """ + This function returns the schema (table definition) of a given table + :table optional name of the table (can be fully qualified) + """ + _table = self._table if 'table' not in _args else _args['table'] + _schema = [] + if _table : + if sqa.__version__.startswith('1.') : + _handler = sqa.MetaData(bind=self._engine) + _handler.reflect() + else: + # + # sqlalchemy's version 2.+ + _handler = sqa.MetaData() + _handler.reflect(bind=self._engine) + # + # Let us extract the schema with the native types + _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} + _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] + return _schema + def has(self,**_args): + return self.meta(**_args) + def apply(self,sql): + """ + Executing sql statement that returns query results (hence the restriction on sql and/or with) + :sql SQL query to be exectued + + @TODO: Execution of stored procedures + """ + return pd.read_sql(sql,self._engine) if sql.lower().startswith('select') or sql.lower().startswith('with') else None + +class SQLBase(Base): + def __init__(self,**_args): + super().__init__(**_args) + def get_provider(self): + raise Exception ("Provider Needs to be set ...") + def get_default_port(self) : + raise Exception ("default port needs to be set") + + def _get_uri(self,**_args): + _host = self._host + _account = '' + if self._port : + _port = self._port + else: + _port = self.get_default_port() + + _host = f'{_host}:{_port}' + + if 'username' in _args : + _account = ''.join([_args['username'],':',_args['password'],'@']) + _database = self._database + _provider = self.get_provider().replace(':','').replace('/','') + # _uri = [f'{_provider}:/',_account,_host,_database] + # _uri = [_item.strip() for _item in _uri if _item.strip()] + # return '/'.join(_uri) + return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}' + +class BaseReader(SQLBase): + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + """ + This function will read a query or table from the specific database + """ + if 'sql' in _args : + sql = 
_args['sql'] + else: + _table = _args['table'] if 'table' in _args else self._table + sql = f'SELECT * FROM {_table}' + return self.apply(sql) + + +class BaseWriter (SQLBase): + """ + This class implements SQLAlchemy support for Writting to a data-store (RDBMS) + """ + def __init__(self,**_args): + super().__init__(**_args) + def write(self,_data,**_args): + if type(_data) == dict : + _df = pd.DataFrame(_data) + elif type(_data) == list : + _df = pd.DataFrame(_data) + else: + _df = _data.copy() + # + # We are assuming we have a data-frame at this point + # + _table = _args['table'] if 'table' in _args else self._table + _mode = {'chunksize':2000000,'if_exists':'append','index':False} + if 'schema' in _args : + _mode['schema'] = _args['schema'] + if 'if_exists' in _args : + _mode['if_exists'] = _args['if_exists'] + _df.to_sql(_table,self._engine,**_args,index=False) \ No newline at end of file diff --git a/transport/sql/mysql.py b/transport/sql/mysql.py new file mode 100644 index 0000000..320eb68 --- /dev/null +++ b/transport/sql/mysql.py @@ -0,0 +1,18 @@ +""" +This file implements support for mysql and maria db (with drivers mysql+mysql) +""" +from transport.sql.common import BaseReader, BaseWriter +# import mysql.connector as my +class MYSQL: + + def get_provider(self): + return "mysql+mysqlconnector" + def get_default_port(self): + return "3306" +class Reader(MYSQL,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) + +class Writer(MYSQL,BaseWriter) : + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file diff --git a/transport/sql/netezza.py b/transport/sql/netezza.py new file mode 100644 index 0000000..6d53164 --- /dev/null +++ b/transport/sql/netezza.py @@ -0,0 +1,15 @@ +import nzpy as nz +from transport.sql.common import BaseReader, BaseWriter + +class Netezza: + def get_provider(self): + return 'netezza+nzpy' + def get_default_port(self): + return '5480' + +class Reader(Netezza,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) +class Writer(Netezza,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file diff --git a/transport/sql/postgresql.py b/transport/sql/postgresql.py new file mode 100644 index 0000000..0831291 --- /dev/null +++ b/transport/sql/postgresql.py @@ -0,0 +1,22 @@ + +from transport.sql.common import BaseReader , BaseWriter +from psycopg2.extensions import register_adapter, AsIs +import numpy as np + +register_adapter(np.int64, AsIs) + +class PG: + def __init__(self,**_args): + super().__init__(**_args) + def get_provider(self): + return "postgresql" + + def get_default_port(self): + return "5432" +class Reader(PG,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) +class Writer(PG,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) + diff --git a/transport/sql/sqlite.py b/transport/sql/sqlite.py new file mode 100644 index 0000000..734ab24 --- /dev/null +++ b/transport/sql/sqlite.py @@ -0,0 +1,25 @@ +import sqlalchemy +import pandas as pd +from transport.sql.common import Base, BaseReader, BaseWriter +class SQLite (BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + if 'path' in _args : + self._database = _args['path'] + if 'database' in _args : + self._database = _args['database'] + def _get_uri(self,**_args): + path = self._database + return f'sqlite:///{path}' # ensure this is the correct path for the sqlite file. 
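+
+    # A minimal usage sketch of this provider through the factory (the file and table names below are
+    # hypothetical, and the default 'read' context is assumed):
+    #
+    #   import transport
+    #   from transport import providers
+    #   _reader = transport.instance(provider=providers.SQLITE, database='/tmp/demo.db3', table='logs')
+    #   _df = _reader.read()                                        #-- SELECT * FROM logs as a data-frame
+    #   _df = _reader.read(sql='SELECT COUNT(*) AS n FROM logs')    #-- or an explicit query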
+ +class Reader(SQLite,BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + # def read(self,**_args): + # sql = _args['sql'] + # return pd.read_sql(sql,self._engine) + + +class Writer (SQLite,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file From 6feae101b0097ef28183cfa4df0d087ce333f449 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 12:52:06 -0500 Subject: [PATCH 194/271] bug fixes: and support for plugins --- transport/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 288f646..387161d 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -17,12 +17,14 @@ Source Code is available under MIT License: https://hiplab.mc.vanderbilt.edu/git/hiplab/data-transport """ import numpy as np + from transport import sql, nosql, cloud, other import pandas as pd import json import os from info import __version__,__author__ - +from transport.iowrapper import IWriter, IReader +from transport.plugins import PluginLoader PROVIDERS = {} def init(): global PROVIDERS @@ -31,7 +33,6 @@ def init(): if _provider_name.startswith('__') : continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} -# print ([ {name:getattr(sql,name)} for name in dir(sql) if not name.startswith('__')]) def instance (**_args): """ @@ -55,9 +56,23 @@ def instance (**_args): _context = _args['context'] else: _context = 'read' - _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') - return _pointer (**_args) - pass + _pointer = getattr(_module,'Reader') if _context == 'read' else getattr(_module,'Writer') + _agent = _pointer (**_args) + # + loader = None + if 'plugins' in _args : + _params = _args['plugins'] + + if 'path' in _params and 'names' in _params : + loader = PluginLoader(**_params) + elif type(_params) == list: + loader = PluginLoader() + for _delegate in _params : + loader.set(_delegate) + + + return IReader(_agent,loader) if _context == 'read' else IWriter(_agent,loader) + else: raise Exception ("Missing or Unknown provider") pass @@ -79,11 +94,3 @@ class factory : pass factory.instance = instance init() -# if __name__ == '__main__' : -# # if not PROVIDERS : -# init() -# print (list(PROVIDERS.keys())) -# pgr = instance(provider='postgresql',database='io',table='foo',write=True) -# print (pgr.read()) -# print () -# print (supported()) \ No newline at end of file From fd899f554985cecdfd5da04e34decd33063f1bcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 12:52:42 -0500 Subject: [PATCH 195/271] adding wrapper class/design pattern to support plugins --- transport/iowrapper.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 transport/iowrapper.py diff --git a/transport/iowrapper.py b/transport/iowrapper.py new file mode 100644 index 0000000..f113d85 --- /dev/null +++ b/transport/iowrapper.py @@ -0,0 +1,47 @@ +""" +This class is a wrapper around read/write classes of cloud,sql,nosql,other packages +The wrapper allows for application of plugins as pre-post conditions +""" +class IO: + """ + Base wrapper class for read/write + """ + def __init__(self,_agent,loader): + self._agent = _agent + self._loader = loader + def meta (self,**_args): + if hasattr(self._agent,'meta') : + return self._agent.meta(**_args) + return [] + + def close(self): + if hasattr(self._agent,'close') : 
+ self._agent.close() + def apply(self): + """ + applying pre/post conditions given a pipeline expression + """ + for _pointer in self._loader : + _data = _pointer(_data) + def apply(self,_query): + if hasattr(self._agent,'apply') : + return self._agent.apply(_query) + return None +class IReader(IO): + def __init__(self,_agent,pipeline=None): + super().__init__(_agent,pipeline) + def read(self,**_args): + _data = self._agent.read(**_args) + if self._loader and self._loader.ratio() > 0 : + _data = self._loader.apply(_data) + # + # output data + return _data +class IWriter(IO): + def __init__(self,_agent,pipeline=None): + super().__init__(_agent,pipeline) + def write(self,_data,**_args): + if self._loader and self._loader.ratio() > 0 : + _data = self._loader.apply(_data) + + self._agent.write(_data,**_args) From b160d0a295ed19deef01d885b1e93c8774b7897e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 13:27:14 -0500 Subject: [PATCH 196/271] housekeeping work --- transport/couch.py | 234 --------------------------------------- transport/mongo.py | 241 ----------------------------------------- transport/nextcloud.py | 80 -------------- transport/qlistener.py | 47 -------- transport/session.py | 88 --------------- 5 files changed, 690 deletions(-) delete mode 100644 transport/couch.py delete mode 100644 transport/mongo.py delete mode 100644 transport/nextcloud.py delete mode 100644 transport/qlistener.py delete mode 100644 transport/session.py diff --git a/transport/couch.py b/transport/couch.py deleted file mode 100644 index 8e02a4e..0000000 --- a/transport/couch.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -Data-Transport -Steve L. Nyemba, The Phi Technology - -This file is a wrapper around couchdb using IBM Cloudant SDK that has an interface to couchdb - -""" -import cloudant -import json -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -class Couch: - """ - This class is a wrapper for read/write against couchdb. The class captures common operations for read/write. - @param url host & port reference default http://localhost:5984 - @param doc user id involved - @param dbname database name (target) - """ - def __init__(self,**args): - url = args['url'] if 'url' in args else 'http://localhost:5984' - self._id = args['doc'] - dbname = args['dbname'] - if 'username' not in args and 'password' not in args : - self.server = cloudant.CouchDB(None,None,url=url) - else: - self.server = cloudant.CouchDB(args['username'],args['password'],url=url) - self.server.connect() - - if dbname in self.server.all_dbs() : - self.dbase = self.server.get(dbname,dbname,True) - # - # @TODO Check if the database exists ... 
- # - doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id) - if not doc.exists(): - doc = self.dbase.create_document({"_id":self._id}) - doc.save() - else: - self.dbase = None - """ - Insuring the preconditions are met for processing - """ - def isready(self): - p = self.server.metadata() != {} - if p == False or not self.dbase: - return False - # - # At this point we are sure that the server is connected - # We are also sure that the database actually exists - # - doc = cloudant.document.Document(self.dbase,self._id) - # q = self.dbase.all_docs(key=self._id)['rows'] - # if not q : - if not doc.exists(): - return False - return True - - def view(self,**args): - """ - The function will execute a view (provivded a user is authenticated) - :id design document _design/xxxx (provide full name with _design prefix) - :view_name name of the view i.e - :key(s) key(s) to be used to filter the content - """ - document = cloudant.design_document.DesignDocument(self.dbase,args['id']) - document.fetch() - params = {'group_level':1,'group':True} - if 'key' in args : - params ['key'] = args['key'] - elif 'keys' in args : - params['keys'] = args['keys'] - return document.get_view(args['view_name'])(**params)['rows'] - - - - -class CouchReader(Couch,Reader): - """ - This function will read an attachment from couchdb and return it to calling code. The attachment must have been placed before hand (otherwise oops) - @T: Account for security & access control - """ - def __init__(self,**args): - """ - @param filename filename (attachment) - """ - # - # setting the basic parameters for - Couch.__init__(self,**args) - if 'filename' in args : - self.filename = args['filename'] - else: - self.filename = None - - # def isready(self): - # # - # # Is the basic information about the database valid - # # - # p = Couchdb.isready(self) - - # if p == False: - # return False - # # - # # The database name is set and correct at this point - # # We insure the document of the given user has the requested attachment. - # # - - # doc = self.dbase.get(self._id) - - # if '_attachments' in doc: - # r = self.filename in doc['_attachments'].keys() - - # else: - # r = False - - # return r - def stream(self): - # - # @TODO Need to get this working ... 
- # - document = cloudant.document.Document(self.dbase,self._id) - # content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ; - content = self.get_attachment(self.filename) - for row in content: - yield row - - def read(self,**args): - if self.filename is not None: - self.stream() - else: - return self.basic_read() - def basic_read(self): - document = cloudant.document.Document(self.dbase,self._id) - - # document = self.dbase.get(self._id) - if document.exists() : - document.fetch() - document = dict(document) - del document['_rev'] - else: - document = {} - return document - -class CouchWriter(Couch,Writer): - """ - This class will write on a couchdb document provided a scope - The scope is the attribute that will be on the couchdb document - """ - def __init__(self,**args): - """ - @param uri host & port reference - @param uid user id involved - @param filename filename (attachment) - @param dbname database name (target) - """ - - Couch.__init__(self,**args) - def set (self,info): - document = cloudand.document.Document(self.dbase,self._id) - if document.exists() : - keys = list(set(document.keys()) - set(['_id','_rev','_attachments'])) - for id in keys : - document.field_set(document,id,None) - for id in args : - value = args[id] - document.field_set(document,id,value) - - document.save() - pass - else: - _document = dict({"_id":self._id},**args) - document.create_document(_document) - def write(self,info): - """ - write a given attribute to a document database - @info object to be written to the to an attribute. this - """ - - # document = self.dbase.get(self._id) - document = cloudant.document.Document(self.dbase,self._id) #.get(self._id) - if document.exists() is False : - document = self.dbase.create_document({"_id":self._id}) - # label = params['label'] - # row = params['row'] - # if label not in document : - # document[label] = [] - # document[label].append(row) - for key in info : - if key in document and type(document[key]) == list : - document[key] += info[key] - else: - document[key] = info[key] - - document.save() - # self.dbase.bulk_docs([document]) - # self.dbase.save_doc(document) - - def upload(self,**args): - """ - :param name name of the file to be uploaded - :param data content of the file (binary or text) - :param content_type (default) - """ - mimetype = args['content_type'] if 'content_type' in args else 'text/plain' - document = cloudant.document.Document(self.dbase,self.uid) - document.put_attachment(self.dbase,args['filename'],mimetype,args['content']) - document.save() - - def archive(self,params=None): - """ - This function will archive the document onto itself. 
- """ - # document = self.dbase.all_docs(self._id,include_docs=True) - document = cloudant.document.Document(self.dbase,self.filename) - document.fetch() - content = {} - # _doc = {} - for id in document: - if id not in ['_id','_rev','_attachments'] : - content[id] = document[id] - del document[id] - - content = json.dumps(content) - # document= _doc - now = str(datetime.today()) - - name = '-'.join([document['_id'] , now,'.json']) - self.upload(filename=name,data=content,content_type='application/json') - # self.dbase.bulk_docs([document]) - # self.dbase.put_attachment(document,content,name,'application/json') - # document.put_attachment(self.dbase,name,'application/json',content) - # document.save() diff --git a/transport/mongo.py b/transport/mongo.py deleted file mode 100644 index c7b5ed8..0000000 --- a/transport/mongo.py +++ /dev/null @@ -1,241 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. Nyemba, The Phi Technology LLC - -This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce) -""" -from pymongo import MongoClient -from bson.objectid import ObjectId -from bson.binary import Binary -# import nujson as json -from datetime import datetime -import pandas as pd -import numpy as np -import gridfs -# from transport import Reader,Writer -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer, IEncoder -else: - from common import Reader, Writer -import json -import re -from multiprocessing import Lock, RLock -class Mongo : - lock = RLock() - """ - Basic mongodb functions are captured here - """ - def __init__(self,**args): - """ - :dbname database name/identifier - :host host and port of the database by default localhost:27017 - :username username for authentication - :password password for current user - """ - - self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] - # authSource=(args['authSource'] if 'authSource' in args else self.dbname) - self._lock = False if 'lock' not in args else args['lock'] - self.dbname = None - username = password = None - if 'auth_file' in args : - _info = json.loads((open(args['auth_file'])).read()) - - - else: - _info = {} - _args = dict(args,**_info) - _map = {'dbname':'db','database':'db','table':'uid','collection':'uid','col':'uid','doc':'uid'} - for key in _args : - if key in ['username','password'] : - username = _args['username'] if key=='username' else username - password = _args['password'] if key == 'password' else password - continue - value = _args[key] - if key in _map : - key = _map[key] - - self.setattr(key,value) - # - # Let us perform aliasing in order to remain backwards compatible - - self.dbname = self.db if hasattr(self,'db')else self.dbname - self.uid = _args['table'] if 'table' in _args else (_args['doc'] if 'doc' in _args else (_args['collection'] if 'collection' in _args else None)) - if username and password : - self.client = MongoClient(self.host, - username=username, - password=password , - authSource=self.authSource, - authMechanism=self.mechanism) - - else: - self.client = MongoClient(self.host,maxPoolSize=10000) - - self.db = self.client[self.dbname] - - def isready(self): - p = self.dbname in self.client.list_database_names() - q = self.uid in self.client[self.dbname].list_collection_names() - return p and q - def setattr(self,key,value): - _allowed = ['host','port','db','doc','collection','authSource','mechanism'] - if key in _allowed : - setattr(self,key,value) - pass - def close(self): - 
self.client.close() - def meta(self,**_args): - return [] -class MongoReader(Mongo,Reader): - """ - This class will read from a mongodb data store and return the content of a document (not a collection) - """ - def __init__(self,**args): - Mongo.__init__(self,**args) - def read(self,**args): - - if 'mongo' in args or 'cmd' in args or 'pipeline' in args: - # - # @TODO: - cmd = {} - if 'aggregate' not in cmd and 'aggregate' not in args: - cmd['aggregate'] = self.uid - elif 'aggregate' in args : - cmd['aggregate'] = args['aggregate'] - if 'pipeline' in args : - cmd['pipeline']= args['pipeline'] - - if 'pipeline' not in args or 'aggregate' not in cmd : - cmd = args['mongo'] if 'mongo' in args else args['cmd'] - if "aggregate" in cmd : - if "allowDiskUse" not in cmd : - cmd["allowDiskUse"] = True - if "cursor" not in cmd : - cmd["cursor"] = {} - r = [] - out = self.db.command(cmd) - #@TODO: consider using a yield (generator) works wonders - while True : - if 'values' in out : - r += out['values'] - if 'cursor' in out : - key = 'firstBatch' if 'firstBatch' in out['cursor'] else 'nextBatch' - else: - key = 'n' - if 'cursor' in out and out['cursor'][key] : - r += list(out['cursor'][key]) - elif key in out and out[key]: - r.append (out[key]) - # yield out['cursor'][key] - if key not in ['firstBatch','nextBatch'] or ('cursor' in out and out['cursor']['id'] == 0) : - break - else: - out = self.db.command({"getMore":out['cursor']['id'],"collection":out['cursor']['ns'].split(".")[-1]}) - - - return pd.DataFrame(r) - else: - - - if 'table' in args or 'collection' in args : - if 'table' in args: - _uid = args['table'] - elif 'collection' in args : - _uid = args['collection'] - else: - _uid = self.uid - else: - _uid = self.uid - collection = self.db[_uid] - _filter = args['filter'] if 'filter' in args else {} - _df = pd.DataFrame(collection.find(_filter)) - columns = _df.columns.tolist()[1:] - return _df[columns] - def view(self,**args): - """ - This function is designed to execute a view (map/reduce) operation - """ - pass -class MongoWriter(Mongo,Writer): - """ - This class is designed to write to a mongodb collection within a database - """ - def __init__(self,**args): - Mongo.__init__(self,**args) - def upload(self,**args) : - """ - This function will upload a file to the current database (using GridFS) - :param data binary stream/text to be stored - :param filename filename to be used - :param encoding content_encoding (default utf-8) - - """ - if 'encoding' not in args : - args['encoding'] = 'utf-8' - gfs = GridFS(self.db) - gfs.put(**args) - - def archive(self): - """ - This function will archive documents to the - """ - collection = self.db[self.uid] - rows = list(collection.find()) - for row in rows : - if type(row['_id']) == ObjectId : - row['_id'] = str(row['_id']) - stream = Binary(json.dumps(collection,cls=IEncoder).encode()) - collection.delete_many({}) - now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)]) - name = ".".join([self.uid,'archive',now])+".json" - description = " ".join([self.uid,'archive',str(len(rows))]) - self.upload(filename=name,data=stream,description=description,content_type='application/json') - # gfs = GridFS(self.db) - # gfs.put(filename=name,description=description,data=stream,encoding='utf-8') - # self.write({{"filename":name,"file":stream,"description":descriptions}}) - - - pass - def write(self,info,**_args): - """ - This function will write to a given collection i.e add a record to a collection (no updates) - @param info 
new record in the collection to be added - """ - # document = self.db[self.uid].find() - #collection = self.db[self.uid] - # if type(info) == list : - # self.db[self.uid].insert_many(info) - # else: - try: - if 'table' in _args or 'collection' in _args : - _uid = _args['table'] if 'table' in _args else _args['collection'] - else: - _uid = self.uid if 'doc' not in _args else _args['doc'] - if self._lock : - Mongo.lock.acquire() - if type(info) == list or type(info) == pd.DataFrame : - self.db[_uid].insert_many(info if type(info) == list else info.to_dict(orient='records')) - else: - self.db[_uid].insert_one(info) - finally: - if self._lock : - Mongo.lock.release() - def set(self,document): - """ - if no identifier is provided the function will delete the entire collection and set the new document. - Please use this function with great care (archive the content first before using it... for safety) - """ - - collection = self.db[self.uid] - if collection.count_document() > 0 and '_id' in document: - id = document['_id'] - del document['_id'] - collection.find_one_and_replace({'_id':id},document) - else: - collection.delete_many({}) - self.write(info) - def close(self): - Mongo.close(self) - # collecton.update_one({"_id":self.uid},document,True) - diff --git a/transport/nextcloud.py b/transport/nextcloud.py deleted file mode 100644 index 2eefd51..0000000 --- a/transport/nextcloud.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -We are implementing transport to and from nextcloud (just like s3) -""" -import os -import sys -from transport.common import Reader,Writer, IEncoder -import pandas as pd -from io import StringIO -import json -import nextcloud_client as nextcloud - -class Nextcloud : - def __init__(self,**_args): - pass - self._delimiter = None - self._handler = nextcloud.Client(_args['url']) - _uid = _args['uid'] - _token = _args['token'] - self._uri = _args['folder'] if 'folder' in _args else './' - if self._uri.endswith('/') : - self._uri = self._uri[:-1] - self._file = None if 'file' not in _args else _args['file'] - self._handler.login(_uid,_token) - def close(self): - try: - self._handler.logout() - except Exception as e: - pass - - -class NextcloudReader(Nextcloud,Reader): - def __init__(self,**_args): - # self._file = [] if 'file' not in _args else _args['file'] - super().__init__(**_args) - pass - def read(self,**_args): - _filename = self._file if 'file' not in _args else _args['file'] - # - # @TODO: if _filename is none, an exception should be raised - # - _uri = '/'.join([self._uri,_filename]) - if self._handler.get_file(_uri) : - # - # - _info = self._handler.file_info(_uri) - _content = self._handler.get_file_contents(_uri).decode('utf8') - if _info.get_content_type() == 'text/csv' : - # - # @TODO: enable handling of csv, xls, parquet, pickles - _file = StringIO(_content) - return pd.read_csv(_file) - else: - # - # if it is neither a structured document like csv, we will return the content as is - return _content - return None -class NextcloudWriter (Nextcloud,Writer): - """ - This class will write data to an instance of nextcloud - """ - def __init__(self,**_args) : - super().__init__(**_args) - self - def write(self,_data,**_args): - """ - This function will upload a file to a given destination - :file has the uri of the location of the file - """ - _filename = self._file if 'file' not in _args else _args['file'] - _uri = '/'.join([self._uri,_filename]) - if type(_data) == pd.DataFrame : - f = StringIO() - _data.to_csv(f,index=False) - _content = f.getvalue() - elif type(_data) == 
dict : - _content = json.dumps(_data,cls=IEncoder) - else: - _content = str(_data) - self._handler.put_file_contents(_uri,_content) - diff --git a/transport/qlistener.py b/transport/qlistener.py deleted file mode 100644 index 26f0ba8..0000000 --- a/transport/qlistener.py +++ /dev/null @@ -1,47 +0,0 @@ -import queue -from threading import Thread, Lock -from transport.common import Reader,Writer -import numpy as np -import pandas as pd - -class qListener : - lock = Lock() - _queue = {'default':queue.Queue()} - def __init__(self,**_args): - self._cache = {} - self._callback = _args['callback'] if 'callback' in _args else None - self._id = _args['id'] if 'id' in _args else 'default' - if self._id not in qListener._queue : - qListener._queue[self._id] = queue.Queue() - thread = Thread(target=self._forward) - thread.start() - def _forward(self): - _q = qListener._queue[self._id] - _data = _q.get() - _q.task_done() - self._callback(_data) - - def has(self,**_args) : - return self._callback is not None - - - def close(self): - """ - This will empty the queue and have it ready for another operation - """ - _q = qListener._queue[self._id] - with _q.mutex: - _q.queue.clear() - _q.all_tasks_done.notify_all() - - def write(self,_data,**_args): - _id = _args['id'] if 'id' in _args else self._id - - _q = qListener._queue[_id] - _q.put(_data) - _q.join() -class Console (qListener): - def __init__(self,**_args): - super().__init__(callback=print) - - # self.callback = print \ No newline at end of file diff --git a/transport/session.py b/transport/session.py deleted file mode 100644 index d74669a..0000000 --- a/transport/session.py +++ /dev/null @@ -1,88 +0,0 @@ -from flask import request, session -from datetime import datetime -import re -from transport.common import Reader, Writer -import json -import requests -from io import StringIO -import pandas as pd - - -class HttpReader(Reader): - """ - This class is designed to read data from an Http request file handler provided to us by flask - The file will be heald in memory and processed accordingly - NOTE: This is inefficient and can crash a micro-instance (becareful) - """ - - def __init__(self,**_args): - self._url = _args['url'] - self._headers = None if 'headers' not in _args else _args['headers'] - - # def isready(self): - # return self.file_length > 0 - def format(self,_response): - _mimetype= _response.headers['Content-Type'] - if _mimetype == 'text/csv' or 'text/csv': - _content = _response.text - return pd.read_csv(StringIO(_content)) - # - # @TODO: Add support for excel, JSON and other file formats that fit into a data-frame - # - - return _response.text - def read(self,**_args): - if self._headers : - r = requests.get(self._url,headers = self._headers) - else: - r = requests.get(self._url,headers = self._headers) - return self.format(r) - -class HttpWriter(Writer): - """ - This class is designed to submit data to an endpoint (url) - """ - def __init__(self,**_args): - """ - @param key required session key - """ - self._url = _args['url'] - self._name = _args['name'] - self._method = 'post' if 'method' not in _args else _args['method'] - - # self.session = params['queue'] - # self.session['sql'] = [] - # self.session['csv'] = [] - # self.tablename = re.sub('..+$','',params['filename']) - # self.session['uid'] = params['uid'] - #self.xchar = params['xchar'] - - - def format_sql(self,row): - values = "','".join([col.replace('"','').replace("'",'') for col in row]) - return "".join(["INSERT INTO :table 
VALUES('",values,"');\n"]).replace(':table',self.tablename) - def isready(self): - return True - def write(self,_data,**_args): - # - # - _method = self._method if 'method' not in _args else _args['method'] - _method = _method.lower() - _mimetype = 'text/csv' - if type(_data) == dict : - _mimetype = 'application/json' - _content = _data - else: - _content = _data.to_dict(orient='records') - _headers = {'Content-Type':_mimetype} - _pointer = getattr(requests,_method) - - _pointer ({self._name:_content},headers=_headers) - - - # label = params['label'] - # row = params ['row'] - - # if label == 'usable': - # self.session['csv'].append(self.format(row,',')) - # self.session['sql'].append(self.format_sql(row)) From 0cf56f3e8f1c4b1909a1ce2f5f88f086bda7bce5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 13:28:13 -0500 Subject: [PATCH 197/271] refactoring V2.0 --- transport/bricks.py | 111 ----------------- transport/common.py | 151 ----------------------- transport/disk.py | 269 ---------------------------------------- transport/rabbitmq.py | 279 ------------------------------------------ transport/s3.py | 130 -------------------- 5 files changed, 940 deletions(-) delete mode 100644 transport/bricks.py delete mode 100644 transport/common.py delete mode 100644 transport/disk.py delete mode 100644 transport/rabbitmq.py delete mode 100644 transport/s3.py diff --git a/transport/bricks.py b/transport/bricks.py deleted file mode 100644 index 0aa4383..0000000 --- a/transport/bricks.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -This file implements databricks handling, This functionality will rely on databricks-sql-connector -LICENSE (MIT) -Copyright 2016-2020, The Phi Technology LLC - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - -@TODO: - - Migrate SQLite to SQL hierarchy - - Include Write in Chunks from pandas -""" -import os -import sqlalchemy -from transport.common import Reader,Writer -import pandas as pd - - -class Bricks: - """ - :host - :token - :database - :cluster_path - :table - """ - def __init__(self,**_args): - _host = _args['host'] - _token= _args['token'] - _cluster_path = _args['cluster_path'] - self._schema = _args['schema'] if 'schema' in _args else _args['database'] - _catalog = _args['catalog'] - self._table = _args['table'] if 'table' in _args else None - - # - # @TODO: - # Sometimes when the cluster isn't up and running it takes a while, the user should be alerted of this - # - - _uri = f'''databricks://token:{_token}@{_host}?http_path={_cluster_path}&catalog={_catalog}&schema={self._schema}''' - self._engine = sqlalchemy.create_engine (_uri) - pass - def meta(self,**_args): - table = _args['table'] if 'table' in _args else self._table - if not table : - return [] - else: - if sqlalchemy.__version__.startswith('1.') : - _m = sqlalchemy.MetaData(bind=self._engine) - _m.reflect(only=[table]) - else: - _m = sqlalchemy.MetaData() - _m.reflect(bind=self._engine) - # - # Let's retrieve te information associated with a table - # - return [{'name':_attr.name,'type':_attr.type} for _attr in _m.tables[table].columns] - - def has(self,**_args): - return self.meta(**_args) - def apply(self,_sql): - try: - if _sql.lower().startswith('select') : - return pd.read_sql(_sql,self._engine) - except Exception as e: - pass - -class BricksReader(Bricks,Reader): - """ - This class is designed for reads and will execute reads against a table name or a select SQL statement - """ - def __init__(self,**_args): - super().__init__(**_args) - def read(self,**_args): - limit = None if 'limit' not in _args else str(_args['limit']) - - if 'sql' in _args : - sql = _args['sql'] - elif 'table' in _args : - table = _args['table'] - sql = f'SELECT * FROM {table}' - if limit : - sql = sql + f' LIMIT {limit}' - - if 'sql' in _args or 'table' in _args : - return self.apply(sql) - else: - return pd.DataFrame() - pass -class BricksWriter(Bricks,Writer): - def __init__(self,**_args): - super().__init__(**_args) - def write(self,_data,**_args): - """ - This data will write data to data-bricks against a given table. If the table is not specified upon initiazation, it can be specified here - _data: data frame to push to databricks - _args: chunks, table, schema - """ - _schema = self._schema if 'schema' not in _args else _args['schema'] - _table = self._table if 'table' not in _args else _args['table'] - _df = _data if type(_data) == pd.DataFrame else _data - if type(_df) == dict : - _df = [_df] - if type(_df) == list : - _df = pd.DataFrame(_df) - _df.to_sql( - name=_table,schema=_schema, - con=self._engine,if_exists='append',index=False); - pass diff --git a/transport/common.py b/transport/common.py deleted file mode 100644 index 8b9f718..0000000 --- a/transport/common.py +++ /dev/null @@ -1,151 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. 
Nyemba, The Phi Technology LLC - -This module is designed to serve as a wrapper to a set of supported data stores : - - couchdb - - mongodb - - Files (character delimited) - - Queues (Rabbmitmq) - - Session (Flask) - - s3 -The supported operations are read/write and providing meta data to the calling code -Requirements : - pymongo - boto - couldant -@TODO: - Enable read/writing to multiple reads/writes -""" -__author__ = 'The Phi Technology' -import numpy as np -import json -import importlib -from multiprocessing import RLock -import queue -# import couch -# import mongo -from datetime import datetime - -class IO: - def init(self,**args): - """ - This function enables attributes to be changed at runtime. Only the attributes defined in the class can be changed - Adding attributes will require sub-classing otherwise we may have an unpredictable class ... - """ - allowed = list(vars(self).keys()) - for field in args : - if field not in allowed : - continue - value = args[field] - setattr(self,field,value) -class IEncoder (json.JSONEncoder): - def default (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - return object.isoformat() - else: - return super(IEncoder,self).default(object) - -class Reader (IO): - """ - This class is an abstraction of a read functionalities of a data store - """ - def __init__(self): - pass - def meta(self,**_args): - """ - This function is intended to return meta-data associated with what has just been read - @return object of meta data information associated with the content of the store - """ - raise Exception ("meta function needs to be implemented") - def read(self,**args): - """ - This function is intended to read the content of a store provided parameters to be used at the discretion of the subclass - """ - raise Exception ("read function needs to be implemented") - - -class Writer(IO): - def __init__(self): - self.cache = {"default":[]} - def log(self,**args): - self.cache[id] = args - def meta (self,id="default",**args): - raise Exception ("meta function needs to be implemented") - def format(self,row,xchar): - if xchar is not None and isinstance(row,list): - return xchar.join(row)+'\n' - elif xchar is None and isinstance(row,dict): - row = json.dumps(row) - return row - def write(self,**args): - """ - This function will write content to a store given parameters to be used at the discretion of the sub-class - """ - raise Exception ("write function needs to be implemented") - - def archive(self): - """ - It is important to be able to archive data so as to insure that growth is controlled - Nothing in nature grows indefinitely neither should data being handled. 
- """ - raise Exception ("archive function needs to be implemented") - def close(self): - """ - This function will close the persistent storage connection/handler - """ - pass -class ReadWriter(Reader,Writer) : - """ - This class implements the read/write functions aggregated - """ - pass -# class Console(Writer): -# lock = RLock() -# def __init__(self,**_args): -# self.lock = _args['lock'] if 'lock' in _args else False -# self.info = self.write -# self.debug = self.write -# self.log = self.write -# pass -# def write (self,logs=None,**_args): -# if self.lock : -# Console.lock.acquire() -# try: -# _params = _args if logs is None and _args else logs -# if type(_params) == list: -# for row in _params : -# print (row) -# else: -# print (_params) -# except Exception as e : -# print (e) -# finally: -# if self.lock : -# Console.lock.release() - - -""" -@NOTE : Experimental !! -""" -class Proxy : - """ - This class will forward a call to a function that is provided by the user code - """ - def __init__(self,**_args): - self.callback = _args['callback'] - def read(self,**_args) : - try: - return self.callback(**_args) - except Exception as e: - return self.callback() - - pass - def write(self,data,**_args): - self.callback(data,**_args) diff --git a/transport/disk.py b/transport/disk.py deleted file mode 100644 index 5e43b69..0000000 --- a/transport/disk.py +++ /dev/null @@ -1,269 +0,0 @@ -import os -import sys - - -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer #, factory -else: - from common import Reader,Writer -# import nujson as json -import json -# from threading import Lock -import sqlite3 -import pandas as pd -from multiprocessing import Lock -from transport.common import Reader, Writer, IEncoder -import sqlalchemy -from sqlalchemy import create_engine -class DiskReader(Reader) : - """ - This class is designed to read data from disk (location on hard drive) - @pre : isready() == True - """ - - def __init__(self,**params): - """ - - @param path absolute path of the file to be read - """ - - Reader.__init__(self) - self.path = params['path'] if 'path' in params else None - self.delimiter = params['delimiter'] if 'delimiter' in params else ',' - - def isready(self): - return os.path.exists(self.path) - def meta(self,**_args): - return [] - def read(self,**args): - _path = self.path if 'path' not in args else args['path'] - _delimiter = self.delimiter if 'delimiter' not in args else args['delimiter'] - return pd.read_csv(_path,delimiter=self.delimiter) - def stream(self,**args): - """ - This function reads the rows from a designated location on disk - @param size number of rows to be read, -1 suggests all rows - """ - - size = -1 if 'size' not in args else int(args['size']) - f = open(self.path,'rU') - i = 1 - for row in f: - - i += 1 - if size == i: - break - if self.delimiter : - yield row.split(self.delimiter) - yield row - f.close() -class DiskWriter(Writer): - - """ - This function writes output to disk in a designated location. 
The function will write a text to a text file - - If a delimiter is provided it will use that to generate a xchar-delimited file - - If not then the object will be dumped as is - """ - THREAD_LOCK = Lock() - def __init__(self,**params): - super().__init__() - self._path = params['path'] - self._delimiter = params['delimiter'] if 'delimiter' in params else None - self._mode = 'w' if 'mode' not in params else params['mode'] - # def meta(self): - # return self.cache['meta'] - # def isready(self): - # """ - # This function determines if the class is ready for execution or not - # i.e it determines if the preconditions of met prior execution - # """ - # return True - # # p = self.path is not None and os.path.exists(self.path) - # # q = self.name is not None - # # return p and q - # def format (self,row): - # self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys()) - # self.cache['meta']['rows'] += 1 - # return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n" - def write(self,info,**_args): - """ - This function writes a record to a designated file - @param label - @param row row to be written - """ - try: - - - DiskWriter.THREAD_LOCK.acquire() - - _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] - _path = self._path if 'path' not in _args else _args['path'] - _mode = self._mode if 'mode' not in _args else _args['mode'] - info.to_csv(_path,index=False,sep=_delim) - pass - except Exception as e: - # - # Not sure what should be done here ... - pass - finally: - DiskWriter.THREAD_LOCK.release() -class SQLite : - def __init__(self,**_args) : - self.path = _args['database'] if 'database' in _args else _args['path'] - self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") - self.conn.row_factory = sqlite3.Row - self.fields = _args['fields'] if 'fields' in _args else [] - def has (self,**_args): - found = False - try: - if 'table' in _args : - table = _args['table'] - sql = "SELECT * FROM :table limit 1".replace(":table",table) - _df = pd.read_sql(sql,self.conn) - found = _df.columns.size > 0 - except Exception as e: - pass - return found - def close(self): - try: - self.conn.close() - except Exception as e : - print(e) - def apply(self,sql): - try: - if not sql.lower().startswith('select'): - cursor = self.conn.cursor() - cursor.execute(sql) - cursor.close() - self.conn.commit() - else: - return pd.read_sql(sql,self.conn) - except Exception as e: - print (e) -class SQLiteReader (SQLite,DiskReader): - def __init__(self,**args): - super().__init__(**args) - # DiskReader.__init__(self,**args) - # self.path = args['database'] if 'database' in args else args['path'] - # self.conn = sqlite3.connect(self.path,isolation_level=None) - # self.conn.row_factory = sqlite3.Row - self.table = args['table'] if 'table' in args else None - def read(self,**args): - if 'sql' in args : - sql = args['sql'] - elif 'filter' in args : - sql = "SELECT :fields FROM ",self.table, "WHERE (:filter)".replace(":filter",args['filter']) - sql = sql.replace(":fields",args['fields']) if 'fields' in args else sql.replace(":fields","*") - else: - sql = ' '.join(['SELECT * FROM ',self.table]) - if 'limit' in args : - sql = sql + " LIMIT "+args['limit'] - return pd.read_sql(sql,self.conn) - def close(self): - try: - self.conn.close() - except Exception as e : - pass - -class SQLiteWriter(SQLite,DiskWriter) : - connection = None - LOCK = Lock() - def __init__(self,**args): - """ - :path - :fields json|csv - """ - # DiskWriter.__init__(self,**args) - 
super().__init__(**args) - self.table = args['table'] if 'table' in args else None - path = self.path - self._engine = create_engine(f'sqlite:///{path}') - - # self.conn = sqlite3.connect(self.path,isolation_level="IMMEDIATE") - # self.conn.row_factory = sqlite3.Row - # self.fields = args['fields'] if 'fields' in args else [] - - if self.fields and not self.isready() and self.table: - self.init(self.fields) - SQLiteWriter.connection = self.conn - def init(self,fields): - self.fields = fields; - sql = " ".join(["CREATE TABLE IF NOT EXISTS ",self.table," (", ",".join(self.fields),")"]) - - cursor = self.conn.cursor() - cursor.execute(sql) - cursor.close() - self.conn.commit() - def isready(self): - try: - sql = "SELECT count(*) FROM sqlite_master where name=':table'" - sql = sql.replace(":table",self.table) - cursor = self.conn.cursor() - - r = cursor.execute(sql) - r = r.fetchall() - cursor.close() - - return r[0][0] != 0 - except Exception as e: - pass - return 0 - # - # If the table doesn't exist we should create it - # - # def write(self,_data,**_args): - # SQLiteWriter.LOCK.acquire() - # try: - # if type(_data) == dict : - # _data = [_data] - # _table = self.table if 'table' not in _args else _args['table'] - # _df = pd.DataFrame(_data) - # _df.to_sql(_table,self._engine.connect(),if_exists='append',index=False) - # except Exception as e: - # print (e) - # SQLiteWriter.LOCK.release() - def write(self,info,**_args): - """ - """ - - #if not self.fields : - # #if type(info) == pd.DataFrame : - # # _columns = list(info.columns) - # #self.init(list(info.keys())) - - if type(info) == dict : - info = [info] - elif type(info) == pd.DataFrame : - info = info.fillna('') - info = info.to_dict(orient='records') - - if not self.fields : - _rec = info[0] - self.init(list(_rec.keys())) - - SQLiteWriter.LOCK.acquire() - try: - - cursor = self.conn.cursor() - sql = " " .join(["INSERT INTO ",self.table,"(", ",".join(self.fields) ,")", "values(:values)"]) - for row in info : - values = [ str(row[field]) if type(row[field]) not in [list,dict] else json.dumps(row[field],cls=IEncoder) for field in self.fields] - values = ["".join(["'",value,"'"]) for value in values] - - # stream =["".join(["",value,""]) if type(value) == str else value for value in row.values()] - # stream = json.dumps(stream,cls=IEncoder) - # stream = stream.replace("[","").replace("]","") - - # print (sql.replace(":values",stream)) - # self.conn.execute(sql.replace(":values",stream) ) - self.conn.execute(sql.replace(":values", ",".join(values)) ) - # cursor.commit() - - self.conn.commit() - # print (sql) - except Exception as e : - print () - - print (e) - pass - SQLiteWriter.LOCK.release() diff --git a/transport/rabbitmq.py b/transport/rabbitmq.py deleted file mode 100644 index a56393b..0000000 --- a/transport/rabbitmq.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. 
Nyemba, The Phi Technology LLC - -This file is a wrapper around rabbitmq server for reading and writing content to a queue (exchange) - -""" -import pika -from datetime import datetime -import re -import json -import os -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -import json -from multiprocessing import RLock -class MessageQueue: - """ - This class hierarchy is designed to handle interactions with a queue server using pika framework (our tests are based on rabbitmq) - :host - :xid identifier of the exchange - :qid identifier of the queue - """ - def __init__(self,**params): - self.host= 'localhost' if 'host' not in params else params['host'] #-- location of the queue server - self.port= 5672 if 'port' not in params else params['port'] - self.virtual_host = '/' if 'vhost' not in params else params['vhost'] - self.exchange = params['exchange'] if 'exchange' in params else 'amq.direct' #-- exchange - self.queue = params['queue'] if 'queue' in params else 'demo' - self.connection = None - self.channel = None - - self.name = self.__class__.__name__.lower() if 'name' not in params else params['name'] - - username = password = None - if 'username' in params : - username = params['username'] - password = params['password'] - if 'auth_file' in params : - _info = json.loads((open(params['auth_file'])).read()) - username=_info['username'] - password=_info['password'] - self.virtual_host = _info['virtual_host'] if 'virtual_host' in _info else self.virtual_host - self.exchange = _info['exchange'] if 'exchange' in _info else self.exchange - self.queue = _info['queue'] if 'queue' in _info else self.queue - - self.credentials= pika.PlainCredentials('guest','guest') - if 'username' in params : - self.credentials = pika.PlainCredentials( - params['username'], - ('' if 'password' not in params else params['password']) - ) - - def init(self,label=None): - properties = pika.ConnectionParameters(host=self.host,port=self.port,virtual_host=self.virtual_host, - client_properties={'connection_name':self.name}, - credentials=self.credentials) - self.connection = pika.BlockingConnection(properties) - self.channel = self.connection.channel() - self.info = self.channel.exchange_declare(exchange=self.exchange,exchange_type='direct',durable=True) - if label is None: - self.qhandler = self.channel.queue_declare(queue=self.queue,durable=True) - else: - self.qhandler = self.channel.queue_declare(queue=label,durable=True) - - self.channel.queue_bind(exchange=self.exchange,queue=self.qhandler.method.queue) - - def isready(self): - #self.init() - resp = self.connection is not None and self.connection.is_open - # self.close() - return resp - def finalize(self): - pass - def close(self): - if self.connection.is_closed == False : - self.channel.close() - self.connection.close() - -class QueueWriter(MessageQueue,Writer): - """ - This class is designed to publish content to an AMQP (Rabbitmq) - The class will rely on pika to implement this functionality - - We will publish information to a given queue for a given exchange - """ - def __init__(self,**params): - #self.host= params['host'] - #self.exchange = params['uid'] - #self.queue = params['queue'] - MessageQueue.__init__(self,**params); - self.init() - - - - - - - - def write(self,data,_type='text/plain'): - """ - This function writes a stream of data to the a given queue - @param object object to be written (will be converted to JSON) - @TODO: make this less chatty - """ - - stream = 
json.dumps(data) if isinstance(data,dict) else data - self.channel.basic_publish( - exchange=self.exchange, - routing_key=self.queue, - body=stream, - properties=pika.BasicProperties(content_type=_type,delivery_mode=2) - ); - # self.close() - - def flush(self): - self.init() - _mode = 1 #-- Non persistent - self.channel.queue_delete( queue=self.queue); - self.close() - -class QueueReader(MessageQueue,Reader): - """ - This class will read from a queue provided an exchange, queue and host - @TODO: Account for security and virtualhosts - """ - - def __init__(self,**params): - """ - @param host host - @param uid exchange identifier - @param qid queue identifier - """ - - #self.host= params['host'] - #self.exchange = params['uid'] - #self.queue = params['qid'] - MessageQueue.__init__(self,**params); - # self.init() - self.durable = False if 'durable' not in params else params['durable'] - # if 'durable' in params : - # self.durable = True - # else: - # self.durable = False - self.size = -1 - self.data = {} - # def init(self,qid): - - # properties = pika.ConnectionParameters(host=self.host) - # self.connection = pika.BlockingConnection(properties) - # self.channel = self.connection.channel() - # self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True) - - # self.info = self.channel.queue_declare(queue=qid,durable=True) - - - def callback(self,channel,method,header,stream): - """ - This is the callback function designed to process the data stream from the queue - - """ - - r = [] - # if re.match("^\{|\[",stream) is not None: - if stream.startswith(b'{') or stream.startswith(b'['): - r = json.loads(stream) - else: - - r = stream - - qid = self.qhandler.method.queue - if qid not in self.data : - self.data[qid] = [] - - self.data[qid].append(r) - # - # We stop reading when the all the messages of the queue are staked - # - if self.size == len(self.data[qid]) or len(self.data[qid]) == self.info.method.message_count: - self.close() - - def read(self,**args): - """ - This function will read, the first message from a queue - @TODO: - Implement channel.basic_get in order to retrieve a single message at a time - Have the number of messages retrieved be specified by size (parameter) - """ - r = {} - self.size = -1 if 'size' in args else int(args['size']) - # - # We enabled the reader to be able to read from several queues (sequentially for now) - # The qid parameter will be an array of queues the reader will be reading from - # - if isinstance(self.queue,str) : - self.queue = [self.queue] - - for qid in self.queue: - self.init(qid) - # r[qid] = [] - - if self.qhandler.method.message_count > 0: - - self.channel.basic_consume(queue=qid,on_message_callback=self.callback,auto_ack=False); - self.channel.start_consuming() - else: - - pass - #self.close() - # r[qid].append( self.data) - - return self.data -class QueueListener(MessageQueue): - lock = RLock() - """ - This class is designed to have an active listener (worker) against a specified Exchange/Queue - It is initialized as would any other object and will require a callback function to address the objects returned. 
- """ - def __init__(self,**args): - MessageQueue.__init__(self,**args) - self.listen = self.read - self.apply = args['apply'] if 'apply' in args else print - self.lock = False if 'lock' not in args else args['lock'] - - def finalize(self,channel,ExceptionReason): - pass - - def callback(self,channel,method,header,stream) : - _info= {} - # if re.match("^\{|\[",stream) is not None: - - - if stream.startswith(b"[") or stream.startswith(b"{"): - _info = json.loads(stream) - else: - - _info = stream - # - # At this point we should invoke the apply function with a lock if need be - # @TODO: Establish a vocabulary - - if stream == b'QUIT' : - # channel.exit() - self.close() - if self.lock == True : - QueueListener.lock.acquire() - try: - # - # In case the user has not specified a function to apply the data against, it will simply be printed - # - self.apply(_info) - except Exception as e: - pass - if self.lock == True : - QueueListener.lock.release() - def read(self): - - self.init(self.queue) - - self.channel.basic_consume(self.queue,self.callback,auto_ack=True); - self.channel.start_consuming() - - - -class Factory : - @staticmethod - def instance(**_args): - """ - :param count number of workers - :param apply function workers - """ - _apply = _args['apply'] - _count = _args['count'] - for i in np.arange(_count) : - _name = _args['name'] if 'name' in _args else 'worker_'+str(i) - transport.factory.instance(provider="rabbit",context="listener",apply=_apply,auth_file=_args['auth_file']) \ No newline at end of file diff --git a/transport/s3.py b/transport/s3.py deleted file mode 100644 index 339cb5c..0000000 --- a/transport/s3.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -Data Transport - 1.0 -Steve L. Nyemba, The Phi Technology LLC - -This file is a wrapper around s3 bucket provided by AWS for reading and writing content -""" -from datetime import datetime -import boto -from boto.s3.connection import S3Connection, OrdinaryCallingFormat -import numpy as np -import botocore -from smart_open import smart_open -import sys -if sys.version_info[0] > 2 : - from transport.common import Reader, Writer -else: - from common import Reader, Writer -import json -from io import StringIO -import json - -class s3 : - """ - @TODO: Implement a search function for a file given a bucket?? 
- """ - def __init__(self,**args) : - """ - This function will extract a file or set of files from s3 bucket provided - @param access_key - @param secret_key - @param path location of the file - @param filter filename or filtering elements - """ - try: - self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) - self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None - # self.path = args['path'] - self.filter = args['filter'] if 'filter' in args else None - self.filename = args['file'] if 'file' in args else None - self.bucket_name = args['bucket'] if 'bucket' in args else None - - except Exception as e : - self.s3 = None - self.bucket = None - print (e) - def meta(self,**args): - """ - :name name of the bucket - """ - info = self.list(**args) - [item.open() for item in info] - return [{"name":item.name,"size":item.size} for item in info] - def list(self,**args): - """ - This function will list the content of a bucket, the bucket must be provided by the name - :name name of the bucket - """ - return list(self.s3.get_bucket(args['name']).list()) - - - def buckets(self): - # - # This function will return all buckets, not sure why but it should be used cautiously - # based on why the s3 infrastructure is used - # - return [item.name for item in self.s3.get_all_buckets()] - - # def buckets(self): - pass - # """ - # This function is a wrapper around the bucket list of buckets for s3 - # """ - # return self.s3.get_all_buckets() - - -class s3Reader(s3,Reader) : - """ - Because s3 contains buckets and files, reading becomes a tricky proposition : - - list files if file is None - - stream content if file is Not None - @TODO: support read from all buckets, think about it - """ - def __init__(self,**args) : - s3.__init__(self,**args) - def files(self): - r = [] - try: - return [item.name for item in self.bucket if item.size > 0] - except Exception as e: - pass - return r - def stream(self,limit=-1): - """ - At this point we should stream a file from a given bucket - """ - key = self.bucket.get_key(self.filename.strip()) - if key is None : - yield None - else: - count = 0 - with smart_open(key) as remote_file: - for line in remote_file: - if count == limit and limit > 0 : - break - yield line - count += 1 - def read(self,**args) : - if self.filename is None : - # - # returning the list of files because no one file was specified. 
- return self.files() - else: - limit = args['size'] if 'size' in args else -1 - return self.stream(limit) - -class s3Writer(s3,Writer) : - - def __init__(self,**args) : - s3.__init__(self,**args) - def mkdir(self,name): - """ - This function will create a folder in a bucket - :name name of the folder - """ - self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) - def write(self,content): - file = StringIO(content.decode("utf8")) - self.s3.upload_fileobj(file,self.bucket_name,self.filename) - pass - From 6f7d912e20a4f134287f4db1560c4efe49afff57 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:06:34 -0500 Subject: [PATCH 198/271] bug fix/refactoring commong IEncoder --- transport/common.py | 17 +++++++++++++++++ transport/nosql/mongodb.py | 16 +++++++++++++++- transport/other/callback.py | 2 +- 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 transport/common.py diff --git a/transport/common.py b/transport/common.py new file mode 100644 index 0000000..e17c615 --- /dev/null +++ b/transport/common.py @@ -0,0 +1,17 @@ +import json + + +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return object.isoformat() + else: + return super(IEncoder,self).default(object) + + diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2784cd2..00d20ba 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -17,7 +17,21 @@ import sys import json import re from multiprocessing import Lock, RLock -from transport.common import IEncoder +# from transport.common import IEncoder + +class IEncoder (json.JSONEncoder): + def default (self,object): + if type(object) == np.integer : + return int(object) + elif type(object) == np.floating: + return float(object) + elif type(object) == np.ndarray : + return object.tolist() + elif type(object) == datetime : + return object.isoformat() + else: + return super(IEncoder,self).default(object) + class Mongo : lock = RLock() diff --git a/transport/other/callback.py b/transport/other/callback.py index 29b03fc..c56c175 100644 --- a/transport/other/callback.py +++ b/transport/other/callback.py @@ -1,6 +1,6 @@ import queue from threading import Thread, Lock -from transport.common import Reader,Writer +# from transport.common import Reader,Writer import numpy as np import pandas as pd From eed612b3969e93f36fef242bfd23be1ce55ade4c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:09:45 -0500 Subject: [PATCH 199/271] bug fix: import --- transport/nosql/mongodb.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 00d20ba..2784cd2 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -17,21 +17,7 @@ import sys import json import re from multiprocessing import Lock, RLock -# from transport.common import IEncoder - -class IEncoder (json.JSONEncoder): - def default (self,object): - if type(object) == np.integer : - return int(object) - elif type(object) == np.floating: - return float(object) - elif type(object) == np.ndarray : - return object.tolist() - elif type(object) == datetime : - return object.isoformat() - else: - return super(IEncoder,self).default(object) - +from transport.common import IEncoder class Mongo : lock = RLock() From 
383f887db68faa78b3a6b004561a2fee19a58ae8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 14:30:00 -0500 Subject: [PATCH 200/271] V2.0 plugin support --- transport/plugins/__init__.py | 128 ++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 transport/plugins/__init__.py diff --git a/transport/plugins/__init__.py b/transport/plugins/__init__.py new file mode 100644 index 0000000..6117664 --- /dev/null +++ b/transport/plugins/__init__.py @@ -0,0 +1,128 @@ +""" +The functions within are designed to load external files and apply functions against the data +The plugins are applied as + - post-processing if we are reading data + - and pre-processing if we are writing data + +The plugin will use a decorator to identify meaningful functions +@TODO: This should work in tandem with loggin (otherwise we don't have visibility into what is going on) +""" +import importlib as IL +import importlib.util +import sys +import os + +class plugin : + """ + Implementing function decorator for data-transport plugins (post-pre)-processing + """ + def __init__(self,**_args): + """ + :name name of the plugin + :mode restrict to reader/writer + :about tell what the function is about + """ + self._name = _args['name'] + self._about = _args['about'] + self._mode = _args['mode'] if 'mode' in _args else 'rw' + def __call__(self,pointer): + def wrapper(_args): + return pointer(_args) + # + # @TODO: + # add attributes to the wrapper object + # + setattr(wrapper,'transport',True) + setattr(wrapper,'name',self._name) + setattr(wrapper,'mode',self._mode) + setattr(wrapper,'about',self._about) + return wrapper + + +class PluginLoader : + """ + This class is intended to load a plugin and make it available and assess the quality of the developed plugin + """ + def __init__(self,**_args): + """ + :path location of the plugin (should be a single file) + :_names of functions to load + """ + _names = _args['names'] if 'names' in _args else None + path = _args['path'] if 'path' in _args else None + self._names = _names if type(_names) == list else [_names] + self._modules = {} + self._names = [] + if path and os.path.exists(path) and _names: + for _name in self._names : + spec = importlib.util.spec_from_file_location('private', path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) #--loads it into sys.modules + if hasattr(module,_name) : + if self.isplugin(module,_name) : + self._modules[_name] = getattr(module,_name) + else: + print ([f'Found {_name}', 'not plugin']) + else: + # + # @TODO: We should log this somewhere some how + print (['skipping ',_name, hasattr(module,_name)]) + pass + else: + # + # Initialization is empty + self._names = [] + pass + def set(self,_pointer) : + """ + This function will set a pointer to the list of modules to be called + This should be used within the context of using the framework as a library + """ + _name = _pointer.__name__ + + self._modules[_name] = _pointer + self._names.append(_name) + def isplugin(self,module,name): + """ + This function determines if a module is a recognized plugin + :module module object loaded from importlib + :name name of the functiion of interest + """ + + p = type(getattr(module,name)).__name__ =='function' + q = hasattr(getattr(module,name),'transport') + # + # @TODO: add a generated key, and more indepth validation + return p and q + def has(self,_name): + """ + This will determine if the module name is loaded or not + """ + return _name in self._modules + def ratio (self): + 
""" + how many modules loaded vs unloaded given the list of names + """ + + _n = len(self._names) + return len(set(self._modules.keys()) & set (self._names)) / _n + def apply(self,_data): + for _name in self._modules : + _pointer = self._modules[_name] + # + # @TODO: add exception handling + _data = _pointer(_data) + return _data + # def apply(self,_data,_name): + # """ + # This function applies an external module function against the data. + # The responsibility is on the plugin to properly return data, thus responsibility is offloaded + # """ + # try: + + # _pointer = self._modules[_name] + # _data = _pointer(_data) + + # except Exception as e: + # pass + # return _data From edd3efd3286d795365fc7ffcdc43a0967b91d66c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 15:41:39 -0500 Subject: [PATCH 201/271] bug fixes: imports --- setup.py | 3 ++- transport/__init__.py | 3 ++- transport/nosql/__init__.py | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 40ba3fb..743746e 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ args = { "version":__version__, "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", - "packages":["transport","info"]} + # "packages":["transport","info","transport/sql"]}, + "packages": find_packages(include=['info', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" diff --git a/transport/__init__.py b/transport/__init__.py index 387161d..2e2897a 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,13 +18,14 @@ Source Code is available under MIT License: """ import numpy as np -from transport import sql, nosql, cloud, other +import sql, nosql, cloud, other import pandas as pd import json import os from info import __version__,__author__ from transport.iowrapper import IWriter, IReader from transport.plugins import PluginLoader +from transport import providers PROVIDERS = {} def init(): global PROVIDERS diff --git a/transport/nosql/__init__.py b/transport/nosql/__init__.py index 465b912..c89b212 100644 --- a/transport/nosql/__init__.py +++ b/transport/nosql/__init__.py @@ -2,9 +2,11 @@ Steve L. Nyemba, nyemba@gmail.com This namespace implements support for cloud databases couchdb,mongodb, cloudant ... """ -from transport.nosql import couchdb -from transport.nosql import mongodb -# from . import mongodb -# from . import couchdb +# from transport.nosql import couchdb +# from transport.nosql import mongodb +from . import mongodb +from . 
import couchdb +# import mongodb +# import couchdb cloudant = couchdb \ No newline at end of file From 165f9913b519c89ac9061b8fbfd486a8d39daa7e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:04:00 -0500 Subject: [PATCH 202/271] bug fix: imports providers (backward compatibility) --- transport/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transport/__init__.py b/transport/__init__.py index 2e2897a..a28b7d9 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -26,6 +26,7 @@ from info import __version__,__author__ from transport.iowrapper import IWriter, IReader from transport.plugins import PluginLoader from transport import providers + PROVIDERS = {} def init(): global PROVIDERS From 90ac26e53e239a99f296a2621090c289581e898e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:08:19 -0500 Subject: [PATCH 203/271] bug fixes --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 743746e..be572b0 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ args = { "author":__author__,"author_email":"info@the-phi.com", "license":"MIT", # "packages":["transport","info","transport/sql"]}, + "packages": find_packages(include=['info', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] From ed5acec4724e3761afb07bd6de660fc40766c08e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:09:51 -0500 Subject: [PATCH 204/271] bug fixes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index be572b0..3df143d 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ args = { "license":"MIT", # "packages":["transport","info","transport/sql"]}, - "packages": find_packages(include=['info', 'transport.*'])} + "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" From 549cc2082434deab857f968ff5bd3697dab674aa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 16:12:04 -0500 Subject: [PATCH 205/271] bug fix ... 
--- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index a28b7d9..333931b 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,7 +18,7 @@ Source Code is available under MIT License: """ import numpy as np -import sql, nosql, cloud, other +from transport import sql, nosql, cloud, other import pandas as pd import json import os From 4b97994ec19fde14ffda010b8d6c977b883651e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 1 Apr 2024 18:37:47 -0500 Subject: [PATCH 206/271] bug fix: layout providers --- bin/transport | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/transport b/bin/transport index 363d2d9..6d5710d 100755 --- a/bin/transport +++ b/bin/transport @@ -80,7 +80,11 @@ def supported (format:str="table") : This function will print supported providers and their associated classifications """ _df = (transport.supported()) - print (json.dumps(_df.to_dict(orient="list"))) + if format in ['list','json'] : + print (json.dumps(_df.to_dict(orient="list"))) + else: + print (_df) + print () @app.command() def version(): From eb81f5a4d208598979c168e79b2043b7e7a6220d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 12:31:41 -0500 Subject: [PATCH 207/271] bug fix: mongodb inserts of structured objects with lists as elements --- transport/common.py | 3 ++- transport/nosql/mongodb.py | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/transport/common.py b/transport/common.py index e17c615..f439ea7 100644 --- a/transport/common.py +++ b/transport/common.py @@ -1,5 +1,6 @@ import json - +import numpy as np +from datetime import datetime class IEncoder (json.JSONEncoder): def default (self,object): diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2784cd2..2b94311 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -213,12 +213,26 @@ class Writer(Mongo): _uid = self.collection if 'doc' not in _args else _args['doc'] if self._lock : Mongo.lock.acquire() + if type(info) == list or type(info) == pd.DataFrame : - info if type(info) == list else info.to_dict(orient='records') - info = json.loads(json.dumps(info,cls=IEncoder)) + if type(info) == pd.DataFrame : + info = info.to_dict(orient='records') + # info if type(info) == list else info.to_dict(orient='records') + info = json.loads(json.dumps(info)) self.db[_uid].insert_many(info) else: - self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) + # + # sometimes a dictionary can have keys with arrays (odd shaped) + # + _keycount = len(info.keys()) + _arraycount = [len(info[key]) for key in info if type(info[key]) in (list,np.array,np.ndarray)] + if _arraycount and len(_arraycount) == _keycount and np.max(_arraycount) == np.min(_arraycount) : + # + # In case an object with consistent structure is passed, we store it accordingly + # + self.write(pd.DataFrame(info),**_args) + else: + self.db[_uid].insert_one(json.loads(json.dumps(info,cls=IEncoder))) finally: if self._lock : Mongo.lock.release() From 9d75d420178eb3e6031c0e1972bb73b35f09c0d2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 12:59:26 -0500 Subject: [PATCH 208/271] bug fix: append mode/replace or truncate upon insert --- transport/sql/common.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/transport/sql/common.py b/transport/sql/common.py index 89dcefb..4c9d4a7 100644 --- a/transport/sql/common.py 
+++ b/transport/sql/common.py @@ -118,8 +118,12 @@ class BaseWriter (SQLBase): # _table = _args['table'] if 'table' in _args else self._table _mode = {'chunksize':2000000,'if_exists':'append','index':False} - if 'schema' in _args : - _mode['schema'] = _args['schema'] - if 'if_exists' in _args : - _mode['if_exists'] = _args['if_exists'] - _df.to_sql(_table,self._engine,**_args,index=False) \ No newline at end of file + for key in ['if_exists','index','chunksize'] : + if key in _args : + _mode[key] = _args[key] + # if 'schema' in _args : + # _mode['schema'] = _args['schema'] + # if 'if_exists' in _args : + # _mode['if_exists'] = _args['if_exists'] + + _df.to_sql(_table,self._engine,**_mode) \ No newline at end of file From 677239585c4520fbca494ba25d670815a11f768e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 16:58:58 -0500 Subject: [PATCH 209/271] bug fix, with bigquery write --- transport/cloud/bigquery.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transport/cloud/bigquery.py b/transport/cloud/bigquery.py index 479c060..ba720af 100644 --- a/transport/cloud/bigquery.py +++ b/transport/cloud/bigquery.py @@ -104,12 +104,12 @@ class Writer (BigQuery): """ try: if self.parallel or 'lock' in _args : - Write.lock.acquire() + Writer.lock.acquire() _args['table'] = self.table if 'table' not in _args else _args['table'] self._write(_data,**_args) finally: if self.parallel: - Write.lock.release() + Writer.lock.release() def submit(self,_sql): """ Write the output of a massive query to a given table, biquery will handle this as a job @@ -144,13 +144,16 @@ class Writer (BigQuery): # Let us insure that the types are somewhat compatible ... # _map = {'INTEGER':np.int64,'DATETIME':'datetime64[ns]','TIMESTAMP':'datetime64[ns]','FLOAT':np.float64,'DOUBLE':np.float64,'STRING':str} # _mode = copy.deepcopy(self.mode) - _mode = self.mode + # _mode = self.mode # _df.to_gbq(**self.mode) #if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) # # Let us adjust the chunking here + if 'if_exists' in _args : + self.mode['if_exists'] = _args['if_exists'] self._chunks = 10 if _df.shape[0] > MAX_CHUNK and self._chunks == 1 else self._chunks _indexes = np.array_split(np.arange(_df.shape[0]),self._chunks) for i in _indexes : - _df.iloc[i].to_gbq(**self.mode) + # _df.iloc[i].to_gbq(**self.mode) + pd_gbq.to_gbq(_df.iloc[i],**self.mode) time.sleep(1) pass \ No newline at end of file From 715e40407a4ca8d638d0a92a6299cf8a34354484 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 17:00:11 -0500 Subject: [PATCH 210/271] adding notebooks (test/examples --- notebooks/bigquery.ipynb | 169 +++++++++++++++++++++++++++++++++++++ notebooks/mongodb.ipynb | 155 ++++++++++++++++++++++++++++++++++ notebooks/mysql.ipynb | 150 ++++++++++++++++++++++++++++++++ notebooks/postgresql.ipynb | 157 ++++++++++++++++++++++++++++++++++ notebooks/sqlite.ipynb | 139 ++++++++++++++++++++++++++++++ 5 files changed, 770 insertions(+) create mode 100644 notebooks/bigquery.ipynb create mode 100644 notebooks/mongodb.ipynb create mode 100644 notebooks/mysql.ipynb create mode 100644 notebooks/postgresql.ipynb create mode 100644 notebooks/sqlite.ipynb diff --git a/notebooks/bigquery.ipynb b/notebooks/bigquery.ipynb new file mode 100644 index 0000000..750f167 --- /dev/null +++ b/notebooks/bigquery.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to Google Bigquery\n", + "\n", + "1. 
Insure you have a Google Bigquery service account key on disk\n", + "2. The service key location is set as an environment variable **BQ_KEY**\n", + "3. The dataset will be automatically created within the project associated with the service key\n", + "\n", + "The cell below creates a dataframe that will be stored within Google Bigquery" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 5440.08it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data transport version ', '2.0.0']\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n", + "DATASET = 'demo'\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "bqw = transport.factory.instance(provider=providers.BIGQUERY,dataset=DATASET,table='friends',context='write',private_key=PRIVATE_KEY)\n", + "bqw.write(_data,if_exists='replace') #-- default is append\n", + "print (['data transport version ', transport.__version__])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from Google Bigquery\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
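As an aside, that interchangeability is easy to verify side by side. A minimal sketch, reusing the same assumptions as this notebook (the **BQ_KEY** environment variable pointing at the service key, and the demo.friends table created above); by default the object returned is a reader, and a writer is obtained by passing context='write':

```
import os
import transport
from transport import providers

PRIVATE_KEY = os.environ['BQ_KEY'] #-- same assumption as above: path to the service key
#
# the two calls below build the same BigQuery reader; transport.instance forwards to the factory
#
bqr = transport.factory.instance(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)
bqr = transport.instance(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)
```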
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading: 100%|\u001b[32m██████████\u001b[0m|\n", + "Downloading: 100%|\u001b[32m██████████\u001b[0m|\n", + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts f0_\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "PRIVATE_KEY=os.environ['BQ_KEY']\n", + "pgr = transport.instance(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from demo.friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset': 'demo', 'table': 'friends'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "{\n", + " \n", + " \"dataset\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/mongodb.ipynb b/notebooks/mongodb.ipynb new file mode 100644 index 0000000..0554669 --- /dev/null +++ b/notebooks/mongodb.ipynb @@ -0,0 +1,155 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to mongodb\n", + "\n", + "Insure mongodb is actually installed on the system, The cell below creates a dataframe that will be stored within mongodb" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to mongodb database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "mgw = transport.factory.instance(provider=providers.MONGODB,db='demo',collection='friends',context='write')\n", + "mgw.write(_data)\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from mongodb\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. 
The code in the background executes an aggregation using **db.runCommand**\n", + "\n", + "- Basic read of the designated collection **find=\\**\n", + "- Executing an aggregate pipeline against a collection **aggreate=\\**\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "--------- STATISTICS ------------\n", + " _id _counts _mean\n", + "0 0 2 102.5\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "mgr = transport.instance(provider=providers.MONGODB,db='foo',collection='friends')\n", + "_df = mgr.read()\n", + "PIPELINE = [{\"$group\":{\"_id\":0,\"_counts\":{\"$sum\":1}, \"_mean\":{\"$avg\":\"$age\"}}}]\n", + "_sdf = mgr.read(aggregate='friends',pipeline=PIPELINE)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 27017,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'db': 'foo',\n", + " 'collection': 'friends',\n", + " 'authSource': '',\n", + " 'mechamism': ''}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":27017,\"username\":\"me\",\"password\":\"foobar\",\"db\":\"foo\",\"collection\":\"friends\",\n", + " \"authSource\":\"\",\"mechamism\":\"\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/mysql.ipynb b/notebooks/mysql.ipynb new file mode 100644 index 0000000..a54d46d --- /dev/null +++ b/notebooks/mysql.ipynb @@ -0,0 +1,150 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to MySQL\n", + "\n", + "1. Insure MySQL is actually installed on the system, \n", + "2. 
There is a database called demo created on the said system\n", + "\n", + "The cell below creates a dataframe that will be stored within postgreSQL" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to PostgreSQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "myw = transport.factory.instance(provider=providers.MYSQL,database='demo',table='friends',context='write',auth_file=\"/home/steve/auth-mysql.json\")\n", + "myw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from MySQL\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a MySQL (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts avg\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "myr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends',auth_file='/home/steve/auth-mysql.json')\n", + "_df = myr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = myr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 3306,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'database': 'demo',\n", + " 'table': 'friends'}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":3306,\"username\":\"me\",\"password\":\"foobar\",\n", + " \"database\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/notebooks/postgresql.ipynb b/notebooks/postgresql.ipynb new file mode 100644 index 0000000..5046f4d --- /dev/null +++ b/notebooks/postgresql.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to PostgreSQL\n", + "\n", + "1. Insure PostgreSQL is actually installed on the system, \n", + "2. There is a database called demo created on the said system\n", + "\n", + "The cell below creates a dataframe that will be stored within postgreSQL" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to PostgreSQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "pgw = transport.factory.instance(provider=providers.POSTGRESQL,database='demo',table='friends',context='write')\n", + "pgw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from PostgreSQL\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts avg\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "pgr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends')\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'host': 'klingon.io',\n", + " 'port': 5432,\n", + " 'username': 'me',\n", + " 'password': 'foobar',\n", + " 'database': 'demo',\n", + " 'table': 'friends'}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "{\n", + " \"host\":\"klingon.io\",\"port\":5432,\"username\":\"me\",\"password\":\"foobar\",\n", + " \"database\":\"demo\",\"table\":\"friends\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/sqlite.ipynb b/notebooks/sqlite.ipynb new file mode 100644 index 0000000..5c249de --- /dev/null +++ b/notebooks/sqlite.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to SQLite3+\n", + "\n", + "The requirements to get started are minimal (actually none). The cell below creates a dataframe that will be stored within SQLite 3+" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.0.0\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to PostgreSQL database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "sqw = transport.factory.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends',context='write')\n", + "sqw.write(_data,if_exists='replace') #-- default is append\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from SQLite3+\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a PostreSQL (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + " _counts AVG(age)\n", + "0 3 83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "pgr = transport.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n", + "_df = pgr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted. 
This is an overkill for SQLite ;-)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "{\n", + " \"provider\":\"sqlite\",\n", + " \"database\":\"/home/steve/demo.db3\",\"table\":\"friends\"\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From e1763b1b192bc34359a7691b6f695c0a6b319977 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 2 Apr 2024 20:59:01 -0500 Subject: [PATCH 211/271] bug fix: ETL, Mongodb --- bin/transport | 8 +++++++- transport/etl.py | 7 ++++++- transport/nosql/mongodb.py | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/bin/transport b/bin/transport index 6d5710d..f483d94 100755 --- a/bin/transport +++ b/bin/transport @@ -62,8 +62,14 @@ def wait(jobs): time.sleep(1) @app.command(name="apply") -def move (path,index=None): +def apply (path,index=None): + """ + This function applies data transport from one source to one or several others + :path path of the configuration file + + :index index of the _item of interest (otherwise everything will be processed) + """ _proxy = lambda _object: _object.write(_object.read()) if os.path.exists(path): file = open(path) diff --git a/transport/etl.py b/transport/etl.py index 162e185..25750de 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -83,7 +83,12 @@ class Transporter(Process): _reader = transport.factory.instance(**self._source) # # If arguments are provided then a query is to be executed (not just a table dump) - return _reader.read() if 'args' not in self._source else _reader.read(**self._source['args']) + if 'cmd' in self._source or 'query' in self._source : + _query = self._source['cmd'] if 'cmd' in self._source else self._source['query'] + return _reader.read(**_query) + else: + return _reader.read() + # return _reader.read() if 'query' not in self._source else _reader.read(**self._source['query']) def _delegate_write(self,_data,**_args): """ diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 2b94311..c498704 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -218,7 +218,7 @@ class Writer(Mongo): if type(info) == pd.DataFrame : info = info.to_dict(orient='records') # info if type(info) == list else info.to_dict(orient='records') - info = json.loads(json.dumps(info)) + info = json.loads(json.dumps(info,cls=IEncoder)) self.db[_uid].insert_many(info) else: # From f6919ccd9324afe34835fa708c544cca0fcd5513 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 16 Apr 2024 09:42:33 -0500 Subject: [PATCH 212/271] bug fix: set function mongodb used for updates --- transport/nosql/mongodb.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index c498704..7c5b8b2 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -12,7 +12,8 @@ from bson.binary import Binary from datetime import datetime import pandas as pd import numpy as np -import gridfs +# import gridfs +from gridfs import GridFS 
import sys import json import re @@ -243,13 +244,17 @@ class Writer(Mongo): """ collection = self.db[self.collection] - if collection.count_document() > 0 and '_id' in document: + if collection.count_documents() > 0 and '_id' in document: id = document['_id'] del document['_id'] collection.find_one_and_replace({'_id':id},document) else: - collection.delete_many({}) - self.write(info) + # + # Nothing to be done if we did not find anything + # + pass + # collection.delete_many({}) + # self.write(info) def close(self): Mongo.close(self) # collecton.update_one({"_id":self.collection},document,True) From 1eda49b63a93d17d9262b6ecdde7d465c5a617e8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 17 Apr 2024 23:56:31 -0500 Subject: [PATCH 213/271] documentation --- README.md | 189 +----------------------------------- notebooks/mssqlserver.ipynb | 160 ++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 185 deletions(-) create mode 100644 notebooks/mssqlserver.ipynb diff --git a/README.md b/README.md index eaa176d..ff8bd39 100644 --- a/README.md +++ b/README.md @@ -1,204 +1,23 @@ # Introduction -This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL** and **SQL** data stores and leverages **pandas**. - -The supported data store providers : - -| Provider | Underlying Drivers | Description | -| :---- | :----: | ----: | -| sqlite| Native SQLite|SQLite3| -| postgresql| psycopg2 | PostgreSQL -| redshift| psycopg2 | Amazon Redshift -| s3| boto3 | Amazon Simple Storage Service -| netezza| nzpsql | IBM Neteeza -| Files: CSV, TSV| pandas| pandas data-frame -| Couchdb| cloudant | Couchbase/Couchdb -| mongodb| pymongo | Mongodb -| mysql| mysql| Mysql -| bigquery| google-bigquery| Google BigQuery -| mariadb| mysql| Mariadb -| rabbitmq|pika| RabbitMQ Publish/Subscribe +This project implements an abstraction of objects that can have access to a variety of data stores, implementing read/write with a simple and expressive interface. This abstraction works with **NoSQL**, **SQL** and **Cloud** data stores and leverages **pandas**. # Why Use Data-Transport ? -Mostly data scientists that don't really care about the underlying database and would like to manipulate data transparently. +Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write data and have will be well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included 3. Mining data from various sources 4. Useful for data migrations or ETL -# Usage - ## Installation Within the virtual environment perform the following : pip install git+https://github.com/lnyemba/data-transport.git -Once installed **data-transport** can be used as a library in code or a command line interface (CLI), as a CLI it is used for ETL and requires a configuration file. 
- - -## Data Transport as a Library (in code) ---- - -The data-transport can be used within code as a library, and offers the following capabilities: - -* Read/Write against [mongodb](https://github.com/lnyemba/data-transport/wiki/mongodb) -* Read/Write against tranditional [RDBMS](https://github.com/lnyemba/data-transport/wiki/rdbms) -* Read/Write against [bigquery](https://github.com/lnyemba/data-transport/wiki/bigquery) -* ETL CLI/Code [ETL](https://github.com/lnyemba/data-transport/wiki/etl) -* Support for pre/post conditions i.e it is possible to specify queries to run before or after a read or write - -The read/write functions make data-transport a great candidate for **data-science**; **data-engineering** or all things pertaining to data. It enables operations across multiple data-stores(relational or not) - -## ETL - -**Embedded in Code** - -It is possible to perform ETL within custom code as follows : - -``` - import transport - import time - - _info = [{source:{'provider':'sqlite','path':'/home/me/foo.csv','table':'me',"pipeline":{"pre":[],"post":[]}},target:{provider:'bigquery',private_key='/home/me/key.json','table':'me','dataset':'mydataset'}}, ...] - procs = transport.factory.instance(provider='etl',info=_info) - # - # - while procs: - procs = [pthread for pthread in procs if pthread.is_alive()] - time.sleep(1) -``` - -**Command Line Interface (CLI):** ---- -The CLI program is called **transport** and it requires a configuration file. The program is intended to move data from one location to another. Supported data stores are in the above paragraphs. - -``` -[ - { - "id":"logs", - "source":{ - "provider":"postgresql","context":"read","database":"mydb", - "cmd":{"sql":"SELECT * FROM logs limit 10"} - }, - "target":{ - "provider":"bigquery","private_key":"/bgqdrive/account/bq-service-account-key.json", - "dataset":"mydataset" - } - }, - -] -``` - -Assuming the above content is stored in a file called **etl-config.json**, we would perform the following in a terminal window: - -``` -[steve@data-transport]$ transport --config ./etl-config.json [--index ] -``` - -**Reading/Writing Mongodb** - -For this example we assume here we are tunneling through port 27018 and there is not access control: - -``` -import transport -reader = factory.instance(provider='mongodb',context='read',host='localhost',port='27018',db='example',doc='logs') - -df = reader.read() #-- reads the entire collection -print (df.head()) -# -#-- Applying mongodb command -PIPELINE = [{"$group":{"_id":None,"count":{"$sum":1}}}] -_command_={"cursor":{},"allowDiskUse":True,"aggregate":"logs","pipeline":PIPLINE} -df = reader.read(mongo=_command) -print (df.head()) -reader.close() -``` -**Read/Writing to Mongodb** ---- - -Scenario 1: Mongodb with security in place - -1. Define an authentication file on disk - - The semantics of the attributes are provided by mongodb, please visit [mongodb documentation](https://mongodb.org/docs). In this example the file is located on _/transport/mongo.json_ -
-
-configuration file - -``` -{ - "username":"me","password":"changeme", - "mechanism":"SCRAM-SHA-1", - "authSource":"admin" -} -``` -Connecting to Mongodb - -``` -import transport -PIPELINE = ... #-- do this yourself -MONGO_KEY = '/transport/mongo.json' -mreader = transport.factory.instance(provider=transport.providers.MONGODB,auth_file=MONGO_KEY,context='read',db='mydb',doc='logs') -_aggregateDF = mreader.read(mongo=PIPELINE) #--results of a aggregate pipeline -_collectionDF= mreader.read() - - -``` - -In order to enable write, change **context** attribute to **'read'**. -
-
-- The configuration file is in JSON format -- The commands passed to mongodb are the same as you would if you applied runCommand in mongodb -- The output is a pandas data-frame -- By default the transport reads, to enable write operations use **context='write'** - -|parameters|description | -| --- | --- | -|db| Name of the database| -|port| Port number to connect to -|doc| Name of the collection of documents| -|username|Username | -|password|password| -|authSource|user database that has authentication info| -|mechanism|Mechnism used for authentication| - -**NOTE** - -Arguments like **db** or **doc** can be placed in the authentication file -
-
- -**Limitations** - -Reads and writes aren't encapsulated in the same object, this is to allow the calling code to deliberately perform actions and hopefully minimize accidents associated with data wrangling. - - -``` -import transport -improt pandas as pd -writer = factory.instance(provider=transport.providers.MONGODB,context='write',host='localhost',port='27018',db='example',doc='logs') - -df = pd.DataFrame({"names":["steve","nico"],"age":[40,30]}) -writer.write(df) -writer.close() -``` - +## Learn More - # - # reading from postgresql - - pgreader = factory.instance(type='postgresql',database=,table=) - pg.read() #-- will read the table by executing a SELECT - pg.read(sql=) - - # - # Reading a document and executing a view - # - document = dreader.read() - result = couchdb.view(id='',view_name=) - +We have available notebooks with sample code to read/write against mongodb, couchdb, Netezza, PostgreSQL, Google Bigquery, Databricks, Microsoft SQL Server, MySQL ... Visit [data-transport homepage](https://healthcareio.the-phi.com/data-transport) \ No newline at end of file diff --git a/notebooks/mssqlserver.ipynb b/notebooks/mssqlserver.ipynb new file mode 100644 index 0000000..f2bee85 --- /dev/null +++ b/notebooks/mssqlserver.ipynb @@ -0,0 +1,160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to Microsoft SQLServer\n", + "\n", + "1. Insure the Microsoft SQL Server is installed and you have access i.e account information\n", + "2. The target database must be created before hand.\n", + "3. We created an authentication file that will contain user account and location of the database\n", + "\n", + "The cell below creates a dataframe that will be stored in a Microsoft SQL Server database.\n", + "\n", + "**NOTE** This was not tested with a cloud instance" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data transport version ', '2.0.0']\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n", + "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", + "\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "msw = transport.factory.instance(provider=providers.MSSQL,table='friends',context='write',auth_file=MSSQL_AUTH_FILE)\n", + "msw.write(_data,if_exists='replace') #-- default is append\n", + "print (['data transport version ', transport.__version__])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from Microsoft SQL Server database\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within an MS SQL Server (simple query). \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
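Before the read cell below, a sketch of what the **mssql.json** auth-file referenced in these cells could contain may help. It mirrors the MySQL/PostgreSQL auth-file examples in the companion notebooks, so the host and credentials are placeholders and the key names are assumptions borrowed from the relational providers; 1433 is simply SQL Server's default port:

```
{
    "provider":"sqlserver",
    "host":"klingon.io","port":1433,
    "username":"me","password":"foobar",
    "database":"demo","table":"friends"
}
```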
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "\n", + "--------- STATISTICS ------------\n", + "\n", + " _counts \n", + "0 3 83\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n", + "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", + "\n", + "msr = transport.instance(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n", + "_df = msr.read()\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = msr.read(sql=_query)\n", + "print (_df)\n", + "print ('\\n--------- STATISTICS ------------\\n')\n", + "print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "\n", + "**NOTE**:\n", + "\n", + "The auth_file is intended to be **JSON** formatted" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'dataset': 'demo', 'table': 'friends'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "{\n", + " \n", + " \"dataset\":\"demo\",\"table\":\"friends\",\"username\":\"\",\"password\":\"\"\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 67b91b43ab24c47abb987cfe3f03a7f2b64bfba3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 23 Apr 2024 13:00:14 -0500 Subject: [PATCH 214/271] new: sqlserver and other refactoring --- info/__init__.py | 2 +- setup.py | 8 ++++---- transport/providers/__init__.py | 4 +++- transport/sql/__init__.py | 2 +- transport/sql/sqlserver.py | 24 ++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 transport/sql/sqlserver.py diff --git a/info/__init__.py b/info/__init__.py index 2d27032..0594d12 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,5 +1,5 @@ __author__ = 'The Phi Technology' -__version__= '2.0.0' +__version__= '2.0.2' __license__=""" diff --git a/setup.py b/setup.py index 3df143d..8e9de26 100644 --- a/setup.py +++ b/setup.py @@ -22,10 +22,10 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy'] +args["install_requires"] = 
['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] -if sys.version_info[0] == 2 : - args['use_2to3'] = True - args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] +# if sys.version_info[0] == 2 : +# args['use_2to3'] = True +# args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] setup(**args) diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py index fc0f1e7..4a583f7 100644 --- a/transport/providers/__init__.py +++ b/transport/providers/__init__.py @@ -26,7 +26,9 @@ S3 = 's3' CALLBACK = 'callback' CONSOLE = 'console' RABBITMQ = 'rabbitmq' -DATABRICKS= 'databricks' +DATABRICKS = 'databricks' +MSSQL ='sqlserver' +SQLSERVER ='sqlserver' # # synonyms of the above diff --git a/transport/sql/__init__.py b/transport/sql/__init__.py index 557d36d..9d026bf 100644 --- a/transport/sql/__init__.py +++ b/transport/sql/__init__.py @@ -3,7 +3,7 @@ This namespace/package wrap the sql functionalities for a certain data-stores - netezza, postgresql, mysql and sqlite - mariadb, redshift (also included) """ -from . import postgresql, mysql, netezza, sqlite +from . import postgresql, mysql, netezza, sqlite, sqlserver # diff --git a/transport/sql/sqlserver.py b/transport/sql/sqlserver.py new file mode 100644 index 0000000..6a53842 --- /dev/null +++ b/transport/sql/sqlserver.py @@ -0,0 +1,24 @@ +""" +Handling Microsoft SQL Server via pymssql driver/connector +""" +import sqlalchemy +import pandas as pd +from transport.sql.common import Base, BaseReader, BaseWriter + + +class MsSQLServer: + def __init__(self,**_args) : + super().__init__(**_args) + pass + def get_provider(self): + # mssql+pymssql://scott:tiger@hostname:port/dbname" + return "mssql+pymssql" + def get_default_port(self): + return "1433" +class Reader (MsSQLServer,BaseReader): + def __init__(self,**_args): + super().__init__(**_args) + +class Writer (MsSQLServer,BaseWriter): + def __init__(self,**_args): + super().__init__(**_args) \ No newline at end of file From 5adbb5a61e423f32f3e90fb36527bac399d55e3b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 24 Apr 2024 13:00:03 -0500 Subject: [PATCH 215/271] bug fixes and documentation --- README.md | 5 +++-- transport/iowrapper.py | 14 +++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ff8bd39..528176d 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,13 @@ This project implements an abstraction of objects that can have access to a vari # Why Use Data-Transport ? -Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write data and have will be well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. +Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included 3. 
Mining data from various sources -4. Useful for data migrations or ETL +4. Useful for data migrations or **ETL** + ## Installation diff --git a/transport/iowrapper.py b/transport/iowrapper.py index f113d85..df6b2ec 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -6,9 +6,9 @@ class IO: """ Base wrapper class for read/write """ - def __init__(self,_agent,loader): + def __init__(self,_agent,plugins): self._agent = _agent - self._loader = loader + self._plugins = plugins def meta (self,**_args): if hasattr(self._agent,'meta') : return self._agent.meta(**_args) @@ -21,7 +21,7 @@ class IO: """ applying pre/post conditions given a pipeline expression """ - for _pointer in self._loader : + for _pointer in self._plugins : _data = _pointer(_data) def apply(self,_query): if hasattr(self._agent,'apply') : @@ -32,8 +32,8 @@ class IReader(IO): super().__init__(_agent,pipeline) def read(self,**_args): _data = self._agent.read(**_args) - if self._loader and self._loader.ratio() > 0 : - _data = self._loader.apply(_data) + if self._plugins and self._plugins.ratio() > 0 : + _data = self._plugins.apply(_data) # # output data return _data @@ -41,7 +41,7 @@ class IWriter(IO): def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def write(self,_data,**_args): - if self._loader and self._loader.ratio() > 0 : - _data = self._loader.apply(_data) + if self._plugins and self._plugins.ratio() > 0 : + _data = self._plugins.apply(_data) self._agent.write(_data,**_args) From f5187790ced0b23c820738467476e20fe8c11825 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 Jun 2024 00:42:42 -0500 Subject: [PATCH 216/271] refactor: etl,better reusability & streamlined and threaded --- bin/transport | 51 +++++++----- info/__init__.py | 8 +- setup.py | 9 +-- transport/__init__.py | 75 ++++++++++++++---- transport/etl.py | 141 ++++++++++++++++------------------ transport/iowrapper.py | 84 +++++++++++++++++++- transport/other/files.py | 5 +- transport/plugins/__init__.py | 9 ++- 8 files changed, 251 insertions(+), 131 deletions(-) diff --git a/bin/transport b/bin/transport index f483d94..fd5d41b 100755 --- a/bin/transport +++ b/bin/transport @@ -44,12 +44,15 @@ import sys import transport import time from multiprocessing import Process -import typer + import os import transport from transport import etl # from transport import providers - +import typer +from typing_extensions import Annotated +from typing import Optional +import time app = typer.Typer() @@ -62,28 +65,33 @@ def wait(jobs): time.sleep(1) @app.command(name="apply") -def apply (path,index=None): +def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], + index:int = typer.Option(help="index of the item of interest, otherwise everything in the file will be processed")): """ This function applies data transport from one source to one or several others - - :path path of the configuration file - - :index index of the _item of interest (otherwise everything will be processed) """ - _proxy = lambda _object: _object.write(_object.read()) + # _proxy = lambda _object: _object.write(_object.read()) if os.path.exists(path): file = open(path) _config = json.loads (file.read() ) file.close() if index : - _config = _config[ int(index)] - etl.instance(**_config) - else: - etl.instance(config=_config) + _config = [_config[ int(index)]] + jobs = [] + for _args in _config : + pthread = etl.instance(**_args) #-- automatically starts the process + jobs.append(pthread) + # + # @TODO: Log the number of processes 
started and estimated time + while jobs : + jobs = [pthread for pthread in jobs if pthread.is_alive()] + time.sleep(1) + # + # @TODO: Log the job termination here ... @app.command(name="providers") -def supported (format:str="table") : +def supported (format:Annotated[str,typer.Argument(help="format of the output, supported formats are (list,table,json)")]="table") : """ - This function will print supported providers and their associated classifications + This function will print supported providers/vendors and their associated classifications """ _df = (transport.supported()) if format in ['list','json'] : @@ -94,9 +102,15 @@ def supported (format:str="table") : @app.command() def version(): - print (transport.version.__version__) + """ + This function will display version and license information + """ + + print (transport.__app_name__,'version ',transport.__version__) + print (transport.__license__) + @app.command() -def generate (path:str): +def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]): """ This function will generate a configuration template to give a sense of how to create one """ @@ -104,15 +118,12 @@ def generate (path:str): { "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, "target": - [{"provider":"file","path":"addresses.csv","delimiter":"csv"},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] + [{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] } ] file = open(path,'w') file.write(json.dumps(_config)) file.close() -@app.command() -def usage(): - print (__doc__) if __name__ == '__main__' : app() # # diff --git a/info/__init__.py b/info/__init__.py index 0594d12..f45fdcd 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,8 +1,8 @@ +__app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.0.2' -__license__=""" - - +__version__= '2.0.4' +__email__ = "info@the-phi.com" +__license__=f""" Copyright 2010 - 2024, Steve L. 
Nyemba Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/setup.py b/setup.py index 8e9de26..002feb8 100644 --- a/setup.py +++ b/setup.py @@ -5,19 +5,16 @@ from setuptools import setup, find_packages import os import sys # from version import __version__,__author__ -from info import __version__, __author__ +from info import __version__, __author__,__app_name__,__license__ -# __author__ = 'The Phi Technology' -# __version__= '1.8.0' - def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { - "name":"data-transport", + "name":__app_name__, "version":__version__, "author":__author__,"author_email":"info@the-phi.com", - "license":"MIT", + "license":__license__, # "packages":["transport","info","transport/sql"]}, "packages": find_packages(include=['info','transport', 'transport.*'])} diff --git a/transport/__init__.py b/transport/__init__.py index 333931b..d7d4518 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -22,8 +22,8 @@ from transport import sql, nosql, cloud, other import pandas as pd import json import os -from info import __version__,__author__ -from transport.iowrapper import IWriter, IReader +from info import __version__,__author__,__email__,__license__,__app_name__ +from transport.iowrapper import IWriter, IReader, IETL from transport.plugins import PluginLoader from transport import providers @@ -32,26 +32,35 @@ def init(): global PROVIDERS for _module in [cloud,sql,nosql,other] : for _provider_name in dir(_module) : - if _provider_name.startswith('__') : + if _provider_name.startswith('__') or _provider_name == 'common': continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} def instance (**_args): """ - type: - read: true|false (default true) - auth_file + This function returns an object of to read or write from a supported database provider/vendor + @provider provider + @context read/write (default is read) + @auth_file: Optional if the database information provided is in a file. 
Useful for not sharing passwords + kwargs These are arguments that are provider/vendor specific """ global PROVIDERS if 'auth_file' in _args: if os.path.exists(_args['auth_file']) : + # + # @TODO: add encryption module and decryption to enable this to be secure + # + f = open(_args['auth_file']) - _args = dict (_args,** json.loads(f.read()) ) + #_args = dict (_args,** json.loads(f.read()) ) + # + # we overrite file parameters with arguments passed + _args = dict (json.loads(f.read()),**_args ) f.close() else: filename = _args['auth_file'] raise Exception(f" {filename} was not found or is invalid") - if _args['provider'] in PROVIDERS : + if 'provider' in _args and _args['provider'] in PROVIDERS : _info = PROVIDERS[_args['provider']] _module = _info['module'] if 'context' in _args : @@ -62,22 +71,54 @@ def instance (**_args): _agent = _pointer (**_args) # loader = None - if 'plugins' in _args : - _params = _args['plugins'] - - if 'path' in _params and 'names' in _params : - loader = PluginLoader(**_params) - elif type(_params) == list: - loader = PluginLoader() - for _delegate in _params : - loader.set(_delegate) + + # + # @TODO: + # define a logger object here that will used by the wrapper + # this would allow us to know what the data-transport is doing and where/how it fails + # + # if 'plugins' in _args : + # _params = _args['plugins'] + # if 'path' in _params and 'names' in _params : + # loader = PluginLoader(**_params) + # elif type(_params) == list: + # loader = PluginLoader() + # for _delegate in _params : + # loader.set(_delegate) + + loader = None if 'plugins' not in _args else _args['plugins'] return IReader(_agent,loader) if _context == 'read' else IWriter(_agent,loader) else: + # + # We can handle the case for an ETL object + # raise Exception ("Missing or Unknown provider") pass +class get : + """ + This class is just a wrapper to make the interface (API) more conversational and easy to understand + """ + @staticmethod + def reader (**_args): + _args['context'] = 'read' + return instance(**_args) + @staticmethod + def writer(**_args): + """ + This function is a wrapper that will return a writer to a database. 
It disambiguates the interface + """ + _args['context'] = 'write' + return instance(**_args) + @staticmethod + def etl (**_args): + if 'source' in _args and 'target' in _args : + return IETL(**_args) + else: + raise Exception ("Malformed input found, object must have both 'source' and 'target' attributes") + def supported (): _info = {} for _provider in PROVIDERS : diff --git a/transport/etl.py b/transport/etl.py index 25750de..2c60e04 100644 --- a/transport/etl.py +++ b/transport/etl.py @@ -39,22 +39,22 @@ import os from multiprocessing import Process -SYS_ARGS = {} -if len(sys.argv) > 1: +# SYS_ARGS = {} +# if len(sys.argv) > 1: - N = len(sys.argv) - for i in range(1,N): - value = None - if sys.argv[i].startswith('--'): - key = sys.argv[i][2:] #.replace('-','') - SYS_ARGS[key] = 1 - if i + 1 < N: - value = sys.argv[i + 1] = sys.argv[i+1].strip() - if key and value and not value.startswith('--'): - SYS_ARGS[key] = value +# N = len(sys.argv) +# for i in range(1,N): +# value = None +# if sys.argv[i].startswith('--'): +# key = sys.argv[i][2:] #.replace('-','') +# SYS_ARGS[key] = 1 +# if i + 1 < N: +# value = sys.argv[i + 1] = sys.argv[i+1].strip() +# if key and value and not value.startswith('--'): +# SYS_ARGS[key] = value - i += 2 +# i += 2 class Transporter(Process): """ The transporter (Jason Stathem) moves data from one persistant store to another @@ -74,81 +74,72 @@ class Transporter(Process): # # Let's insure we can support multiple targets self._target = [self._target] if type(self._target) != list else self._target - pass - def read(self,**_args): - """ - This function - """ - _reader = transport.factory.instance(**self._source) + def run(self): + + _reader = transport.get.etl(source=self._source,target=self._target) # - # If arguments are provided then a query is to be executed (not just a table dump) if 'cmd' in self._source or 'query' in self._source : _query = self._source['cmd'] if 'cmd' in self._source else self._source['query'] return _reader.read(**_query) else: return _reader.read() - # return _reader.read() if 'query' not in self._source else _reader.read(**self._source['query']) + + # def _read(self,**_args): + # """ + # This function + # """ + # _reader = transport.factory.instance(**self._source) + # # + # # If arguments are provided then a query is to be executed (not just a table dump) + # if 'cmd' in self._source or 'query' in self._source : + # _query = self._source['cmd'] if 'cmd' in self._source else self._source['query'] + # return _reader.read(**_query) + # else: + # return _reader.read() + # # return _reader.read() if 'query' not in self._source else _reader.read(**self._source['query']) - def _delegate_write(self,_data,**_args): - """ - This function will write a data-frame to a designated data-store, The function is built around a delegation design pattern - :data data-frame or object to be written - """ - if _data.shape[0] > 0 : - for _target in self._target : - if 'write' not in _target : - _target['context'] = 'write' - # _target['lock'] = True - else: - # _target['write']['lock'] = True - pass - _writer = transport.factory.instance(**_target) - _writer.write(_data,**_args) - if hasattr(_writer,'close') : - _writer.close() + # def _delegate_write(self,_data,**_args): + # """ + # This function will write a data-frame to a designated data-store, The function is built around a delegation design pattern + # :data data-frame or object to be written + # """ + # if _data.shape[0] > 0 : + # for _target in self._target : + # if 'write' not in _target : + # 
_target['context'] = 'write' + # # _target['lock'] = True + # else: + # # _target['write']['lock'] = True + # pass + # _writer = transport.factory.instance(**_target) + # _writer.write(_data,**_args) + # if hasattr(_writer,'close') : + # _writer.close() - def write(self,_df,**_args): - """ - """ - SEGMENT_COUNT = 6 - MAX_ROWS = 1000000 - # _df = self.read() - _segments = np.array_split(np.arange(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) - # _index = 0 + # def write(self,_df,**_args): + # """ + # """ + # SEGMENT_COUNT = 6 + # MAX_ROWS = 1000000 + # # _df = self.read() + # _segments = np.array_split(np.arange(_df.shape[0]),SEGMENT_COUNT) if _df.shape[0] > MAX_ROWS else np.array( [np.arange(_df.shape[0])]) + # # _index = 0 - for _indexes in _segments : - _fwd_args = {} if not _args else _args + # for _indexes in _segments : + # _fwd_args = {} if not _args else _args - self._delegate_write(_df.iloc[_indexes],**_fwd_args) - time.sleep(1) - # - # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) - pass + # self._delegate_write(_df.iloc[_indexes],**_fwd_args) + # time.sleep(1) + # # + # # @TODO: Perhaps consider writing up each segment in a thread/process (speeds things up?) + # pass def instance(**_args): - _proxy = lambda _agent: _agent.write(_agent.read()) - if 'source' in _args and 'target' in _args : - - _agent = Transporter(**_args) - _proxy(_agent) - - else: - _config = _args['config'] - _items = [Transporter(**_item) for _item in _config ] - _MAX_JOBS = 5 - _items = np.array_split(_items,_MAX_JOBS) - for _batch in _items : - jobs = [] - for _item in _batch : - thread = Process(target=_proxy,args = (_item,)) - thread.start() - jobs.append(thread) - while jobs : - jobs = [thread for thread in jobs if thread.is_alive()] - time.sleep(1) - + pthread = Transporter (**_args) + pthread.start() + return pthread pass # class Post(Process): # def __init__(self,**args): @@ -360,4 +351,4 @@ def instance(**_args): # print (["Finished ",(N-len(procs)), " remaining ", len(procs)]) # N = len(procs) # time.sleep(1) -# # print ("We're done !!") \ No newline at end of file +# # print ("We're done !!") diff --git a/transport/iowrapper.py b/transport/iowrapper.py index df6b2ec..e3ff611 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -1,14 +1,39 @@ """ This class is a wrapper around read/write classes of cloud,sql,nosql,other packages -The wrapper allows for application of plugins as pre-post conditions +The wrapper allows for application of plugins as pre-post conditions. 
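To make the pre/post-condition idea concrete, here is a minimal sketch of passing pipeline functions to a reader. Only the plugins=[...] argument comes from this wrapper; the cleansing function and the column it touches are made up for illustration, and the SQLite path/table reuse the demo database from the notebooks:

```
import transport
from transport import providers

def _cap_age (_df):
    #
    # hypothetical pipeline function, applied to the data-frame as part of a read
    #
    _df['age'] = _df['age'].clip(upper=120)
    return _df

reader = transport.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends',plugins=[_cap_age])
_df = reader.read() #-- the plugin pipeline is applied to the frame before it is returned
```

A plugin pipeline can also be loaded from a file by passing a dictionary of the form {'path':..., 'names':[...]} instead of a list of callables.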
+NOTE: Plugins are converted to a pipeline, so we apply a pipeline when reading or writing: + - upon initialization we will load plugins + - on read/write we apply a pipeline (if passed as an argument) """ +from transport.plugins import plugin, PluginLoader +import transport +from transport import providers +from multiprocessing import Process +import time + + class IO: """ - Base wrapper class for read/write + Base wrapper class for read/write and support for logs """ def __init__(self,_agent,plugins): self._agent = _agent - self._plugins = plugins + if plugins : + self._init_plugins(plugins) + else: + self._plugins = None + + def _init_plugins(self,_args): + """ + This function will load pipelined functions as a plugin loader + """ + if 'path' in _args and 'names' in _args : + self._plugins = PluginLoader(**_args) + else: + self._plugins = PluginLoader() + [self._plugins.set(_pointer) for _pointer in _args] + # + # @TODO: We should have a way to log what plugins are loaded and ready to use def meta (self,**_args): if hasattr(self._agent,'meta') : return self._agent.meta(**_args) @@ -28,9 +53,14 @@ class IO: return self._agent.apply(_query) return None class IReader(IO): + """ + This is a wrapper for read functionalities + """ def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def read(self,**_args): + if 'pipeline' in _args : + self._init_plugins(_args['pipeline']) _data = self._agent.read(**_args) if self._plugins and self._plugins.ratio() > 0 : _data = self._plugins.apply(_data) @@ -41,7 +71,55 @@ class IWriter(IO): def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def write(self,_data,**_args): + if 'pipeline' in _args : + self._init_plugins(_args['pipeline']) if self._plugins and self._plugins.ratio() > 0 : _data = self._plugins.apply(_data) self._agent.write(_data,**_args) + +# +# The ETL object in its simplest form is an aggregation of read/write objects +# @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process + +def _ProcessWriter (_data,_args): + writer = transport.get.writer(**_args) + writer.write(_data) + +class IETL(IReader) : + """ + This class performs an ETL operation by ineriting a read and adding writes as pipeline functions + """ + def __init__(self,**_args): + super().__init__(transport.get.reader(**_args['source'])) + if 'target' in _args: + self._targets = _args['target'] if type(_args['target']) == list else [_args['target']] + else: + self._targets = [] + self.jobs = [] + # + # If the parent is already multiprocessing + self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess'] + def read(self,**_args): + _data = super().read(**_args) + + for _kwargs in self._targets : + self.post(_data,**_kwargs) + # pthread = Process(target=_ProcessWriter,args=(_data,_kwargs)) + # pthread.start() + # self.jobs.append(pthread) + + # if not self._hasParentProcess : + # while self.jobs : + # jobs = [pthread for pthread in self.jobs if pthread.is_alive()] + # time.sleep(1) + + return _data + def post (self,_data,**_args) : + """ + This function returns an instance of a process that will perform the write operation + :_args parameters associated with writer object + """ + writer = transport.get.writer(**_args) + writer.write(_data) + writer.close() \ No newline at end of file diff --git a/transport/other/files.py b/transport/other/files.py index a4e8a08..62ee3c4 100644 --- a/transport/other/files.py +++ b/transport/other/files.py @@ -53,8 +53,8 @@ class Writer (File): """ 
try: - _delim = self._delimiter if 'delimiter' not in _args else _args['delimiter'] - _path = self._path if 'path' not in _args else _args['path'] + _delim = self.delimiter if 'delimiter' not in _args else _args['delimiter'] + _path = self.path if 'path' not in _args else _args['path'] _mode = self._mode if 'mode' not in _args else _args['mode'] info.to_csv(_path,index=False,sep=_delim) @@ -62,6 +62,7 @@ class Writer (File): except Exception as e: # # Not sure what should be done here ... + print (e) pass finally: # DiskWriter.THREAD_LOCK.release() diff --git a/transport/plugins/__init__.py b/transport/plugins/__init__.py index 6117664..26e5782 100644 --- a/transport/plugins/__init__.py +++ b/transport/plugins/__init__.py @@ -25,9 +25,9 @@ class plugin : self._name = _args['name'] self._about = _args['about'] self._mode = _args['mode'] if 'mode' in _args else 'rw' - def __call__(self,pointer): - def wrapper(_args): - return pointer(_args) + def __call__(self,pointer,**kwargs): + def wrapper(_args,**kwargs): + return pointer(_args,**kwargs) # # @TODO: # add attributes to the wrapper object @@ -55,6 +55,7 @@ class PluginLoader : self._names = [] if path and os.path.exists(path) and _names: for _name in self._names : + spec = importlib.util.spec_from_file_location('private', path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) #--loads it into sys.modules @@ -101,7 +102,7 @@ class PluginLoader : return _name in self._modules def ratio (self): """ - how many modules loaded vs unloaded given the list of names + This functiion determines how many modules loaded vs unloaded given the list of names """ _n = len(self._names) From 870c1caed3688205b5808c5f90d70dabb24f03c4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 Jun 2024 01:03:59 -0500 Subject: [PATCH 217/271] bug fix: use plugins to refer to plugins --- transport/iowrapper.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/transport/iowrapper.py b/transport/iowrapper.py index e3ff611..d6cba1c 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -59,8 +59,8 @@ class IReader(IO): def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def read(self,**_args): - if 'pipeline' in _args : - self._init_plugins(_args['pipeline']) + if 'plugins' in _args : + self._init_plugins(_args['plugins']) _data = self._agent.read(**_args) if self._plugins and self._plugins.ratio() > 0 : _data = self._plugins.apply(_data) @@ -71,8 +71,8 @@ class IWriter(IO): def __init__(self,_agent,pipeline=None): super().__init__(_agent,pipeline) def write(self,_data,**_args): - if 'pipeline' in _args : - self._init_plugins(_args['pipeline']) + if 'plugins' in _args : + self._init_plugins(_args['plugins']) if self._plugins and self._plugins.ratio() > 0 : _data = self._plugins.apply(_data) @@ -82,10 +82,6 @@ class IWriter(IO): # The ETL object in its simplest form is an aggregation of read/write objects # @TODO: ETL can/should aggregate a writer as a plugin and apply it as a process -def _ProcessWriter (_data,_args): - writer = transport.get.writer(**_args) - writer.write(_data) - class IETL(IReader) : """ This class performs an ETL operation by ineriting a read and adding writes as pipeline functions @@ -105,15 +101,7 @@ class IETL(IReader) : for _kwargs in self._targets : self.post(_data,**_kwargs) - # pthread = Process(target=_ProcessWriter,args=(_data,_kwargs)) - # pthread.start() - # self.jobs.append(pthread) - - # if not self._hasParentProcess : - # while 
self.jobs : - # jobs = [pthread for pthread in self.jobs if pthread.is_alive()] - # time.sleep(1) - + return _data def post (self,_data,**_args) : """ From d0472ccee5c6cdbe138d1e54fdb86b4244942609 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 Jun 2024 01:27:58 -0500 Subject: [PATCH 218/271] documentation added (notebooks) --- notebooks/bigquery.ipynb | 45 +++++++------------------ notebooks/mongodb.ipynb | 51 +++++++--------------------- notebooks/mssqlserver.ipynb | 66 ++++++++++++++++--------------------- notebooks/mysql.ipynb | 39 ++++++++++++++-------- notebooks/postgresql.ipynb | 29 +++++++++------- notebooks/sqlite.ipynb | 22 ++++++++----- 6 files changed, 107 insertions(+), 145 deletions(-) diff --git a/notebooks/bigquery.ipynb b/notebooks/bigquery.ipynb index 750f167..45b5400 100644 --- a/notebooks/bigquery.ipynb +++ b/notebooks/bigquery.ipynb @@ -15,21 +15,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 1/1 [00:00<00:00, 5440.08it/s]\n" + "100%|██████████| 1/1 [00:00<00:00, 10106.76it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "['data transport version ', '2.0.0']\n" + "['data transport version ', '2.0.4']\n" ] } ], @@ -45,7 +45,7 @@ "PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n", "DATASET = 'demo'\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "bqw = transport.factory.instance(provider=providers.BIGQUERY,dataset=DATASET,table='friends',context='write',private_key=PRIVATE_KEY)\n", + "bqw = transport.get.writer(provider=providers.BIGQUERY,dataset=DATASET,table='friends',private_key=PRIVATE_KEY)\n", "bqw.write(_data,if_exists='replace') #-- default is append\n", "print (['data transport version ', transport.__version__])\n" ] @@ -63,7 +63,8 @@ "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { @@ -93,7 +94,7 @@ "from transport import providers\n", "import os\n", "PRIVATE_KEY=os.environ['BQ_KEY']\n", - "pgr = transport.instance(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)\n", + "pgr = transport.get.reader(provider=providers.BIGQUERY,dataset='demo',table='friends',private_key=PRIVATE_KEY)\n", "_df = pgr.read()\n", "_query = 'SELECT COUNT(*) _counts, AVG(age) from demo.friends'\n", "_sdf = pgr.read(sql=_query)\n", @@ -106,35 +107,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", - "\n", - "**NOTE**:\n", + "An **auth-file** is a file that contains database parameters used to access the database. 
\n", + "For code in shared environments, we recommend \n", "\n", - "The auth_file is intended to be **JSON** formatted" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'dataset': 'demo', 'table': 'friends'}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", "\n", - "{\n", - " \n", - " \"dataset\":\"demo\",\"table\":\"friends\"\n", - "}" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { diff --git a/notebooks/mongodb.ipynb b/notebooks/mongodb.ipynb index 0554669..fb1532c 100644 --- a/notebooks/mongodb.ipynb +++ b/notebooks/mongodb.ipynb @@ -11,14 +11,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2.0.0\n" + "2.0.4\n" ] } ], @@ -30,7 +30,7 @@ "from transport import providers\n", "import pandas as pd\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "mgw = transport.factory.instance(provider=providers.MONGODB,db='demo',collection='friends',context='write')\n", + "mgw = transport.get.writer(provider=providers.MONGODB,db='demo',collection='friends')\n", "mgw.write(_data)\n", "print (transport.__version__)" ] @@ -48,12 +48,13 @@ "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -73,7 +74,7 @@ "\n", "import transport\n", "from transport import providers\n", - "mgr = transport.instance(provider=providers.MONGODB,db='foo',collection='friends')\n", + "mgr = transport.get.reader(provider=providers.MONGODB,db='foo',collection='friends')\n", "_df = mgr.read()\n", "PIPELINE = [{\"$group\":{\"_id\":0,\"_counts\":{\"$sum\":1}, \"_mean\":{\"$avg\":\"$age\"}}}]\n", "_sdf = mgr.read(aggregate='friends',pipeline=PIPELINE)\n", @@ -86,41 +87,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", "\n", - "**NOTE**:\n", + "1. Having the **auth-file** stored on disk \n", + "2. 
and the location of the file is set to an environment variable.\n", "\n", - "The auth_file is intended to be **JSON** formatted" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'host': 'klingon.io',\n", - " 'port': 27017,\n", - " 'username': 'me',\n", - " 'password': 'foobar',\n", - " 'db': 'foo',\n", - " 'collection': 'friends',\n", - " 'authSource': '',\n", - " 'mechamism': ''}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "{\n", - " \"host\":\"klingon.io\",\"port\":27017,\"username\":\"me\",\"password\":\"foobar\",\"db\":\"foo\",\"collection\":\"friends\",\n", - " \"authSource\":\"\",\"mechamism\":\"\"\n", - "}" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { diff --git a/notebooks/mssqlserver.ipynb b/notebooks/mssqlserver.ipynb index f2bee85..51827b3 100644 --- a/notebooks/mssqlserver.ipynb +++ b/notebooks/mssqlserver.ipynb @@ -17,17 +17,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['data transport version ', '2.0.0']\n" - ] - } - ], + "outputs": [], "source": [ "#\n", "# Writing to Google Bigquery database\n", @@ -41,7 +33,7 @@ "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", "\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "msw = transport.factory.instance(provider=providers.MSSQL,table='friends',context='write',auth_file=MSSQL_AUTH_FILE)\n", + "msw = transport.get.writer(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n", "msw.write(_data,if_exists='replace') #-- default is append\n", "print (['data transport version ', transport.__version__])\n" ] @@ -59,30 +51,15 @@ "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." 
+ "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " name age\n", - "0 James Bond 55\n", - "1 Steve Rogers 150\n", - "2 Steve Nyemba 44\n", - "\n", - "--------- STATISTICS ------------\n", - "\n", - " _counts \n", - "0 3 83\n" - ] - } - ], + "outputs": [], "source": [ "\n", "import transport\n", @@ -91,7 +68,7 @@ "AUTH_FOLDER = os.environ['DT_AUTH_FOLDER'] #-- location of the service key\n", "MSSQL_AUTH_FILE= os.sep.join([AUTH_FOLDER,'mssql.json'])\n", "\n", - "msr = transport.instance(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n", + "msr = transport.get.reader(provider=providers.MSSQL,table='friends',auth_file=MSSQL_AUTH_FILE)\n", "_df = msr.read()\n", "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", "_sdf = msr.read(sql=_query)\n", @@ -104,25 +81,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", "\n", - "**NOTE**:\n", + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", "\n", - "The auth_file is intended to be **JSON** formatted" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'dataset': 'demo', 'table': 'friends'}" + "{'provider': 'sqlserver',\n", + " 'dataset': 'demo',\n", + " 'table': 'friends',\n", + " 'username': '',\n", + " 'password': ''}" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -130,10 +113,17 @@ "source": [ "\n", "{\n", - " \n", + " \"provider\":\"sqlserver\",\n", " \"dataset\":\"demo\",\"table\":\"friends\",\"username\":\"\",\"password\":\"\"\n", "}" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/mysql.ipynb b/notebooks/mysql.ipynb index a54d46d..4b8fb6b 100644 --- a/notebooks/mysql.ipynb +++ b/notebooks/mysql.ipynb @@ -14,14 +14,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2.0.0\n" + "2.0.4\n" ] } ], @@ -33,7 +33,7 @@ "from transport import providers\n", "import pandas as pd\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "myw = transport.factory.instance(provider=providers.MYSQL,database='demo',table='friends',context='write',auth_file=\"/home/steve/auth-mysql.json\")\n", + "myw = transport.get.writer(provider=providers.MYSQL,database='demo',table='friends',auth_file=\"/home/steve/auth-mysql.json\")\n", "myw.write(_data,if_exists='replace') #-- default is append\n", "print 
(transport.__version__)" ] @@ -51,12 +51,13 @@ "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -68,8 +69,8 @@ "1 Steve Rogers 150\n", "2 Steve Nyemba 44\n", "--------- STATISTICS ------------\n", - " _counts avg\n", - "0 3 83.0\n" + " _counts AVG(age)\n", + "0 3 83.0\n" ] } ], @@ -77,7 +78,7 @@ "\n", "import transport\n", "from transport import providers\n", - "myr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends',auth_file='/home/steve/auth-mysql.json')\n", + "myr = transport.get.reader(provider=providers.MYSQL,database='demo',table='friends',auth_file='/home/steve/auth-mysql.json')\n", "_df = myr.read()\n", "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", "_sdf = myr.read(sql=_query)\n", @@ -90,16 +91,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", "\n", - "**NOTE**:\n", + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", "\n", - "The auth_file is intended to be **JSON** formatted" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -109,21 +112,29 @@ " 'port': 3306,\n", " 'username': 'me',\n", " 'password': 'foobar',\n", + " 'provider': 'mysql',\n", " 'database': 'demo',\n", " 'table': 'friends'}" ] }, - "execution_count": 1, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{\n", - " \"host\":\"klingon.io\",\"port\":3306,\"username\":\"me\",\"password\":\"foobar\",\n", + " \"host\":\"klingon.io\",\"port\":3306,\"username\":\"me\",\"password\":\"foobar\", \"provider\":\"mysql\",\n", " \"database\":\"demo\",\"table\":\"friends\"\n", "}" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/postgresql.ipynb b/notebooks/postgresql.ipynb index 5046f4d..85f4322 100644 --- a/notebooks/postgresql.ipynb +++ b/notebooks/postgresql.ipynb @@ -14,14 +14,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2.0.0\n" + "2.0.4\n" ] } ], @@ -33,7 +33,7 @@ "from transport import providers\n", "import pandas as pd\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "pgw = transport.factory.instance(provider=providers.POSTGRESQL,database='demo',table='friends',context='write')\n", + "pgw = 
transport.get.writer(provider=providers.POSTGRESQL,database='demo',table='friends')\n", "pgw.write(_data,if_exists='replace') #-- default is append\n", "print (transport.__version__)" ] @@ -49,14 +49,16 @@ "- Basic read of the designated table (friends) created above\n", "- Execute an aggregate SQL against the table\n", "\n", + "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -77,7 +79,7 @@ "\n", "import transport\n", "from transport import providers\n", - "pgr = transport.instance(provider=providers.POSTGRESQL,database='demo',table='friends')\n", + "pgr = transport.get.reader(provider=providers.POSTGRESQL,database='demo',table='friends')\n", "_df = pgr.read()\n", "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", "_sdf = pgr.read(sql=_query)\n", @@ -90,16 +92,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", "\n", - "**NOTE**:\n", + "1. Having the **auth-file** stored on disk \n", + "2. 
and the location of the file is set to an environment variable.\n", "\n", - "The auth_file is intended to be **JSON** formatted" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -109,18 +113,19 @@ " 'port': 5432,\n", " 'username': 'me',\n", " 'password': 'foobar',\n", + " 'provider': 'postgresql',\n", " 'database': 'demo',\n", " 'table': 'friends'}" ] }, - "execution_count": 1, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{\n", - " \"host\":\"klingon.io\",\"port\":5432,\"username\":\"me\",\"password\":\"foobar\",\n", + " \"host\":\"klingon.io\",\"port\":5432,\"username\":\"me\",\"password\":\"foobar\", \"provider\":\"postgresql\",\n", " \"database\":\"demo\",\"table\":\"friends\"\n", "}" ] diff --git a/notebooks/sqlite.ipynb b/notebooks/sqlite.ipynb index 5c249de..2a836c5 100644 --- a/notebooks/sqlite.ipynb +++ b/notebooks/sqlite.ipynb @@ -18,7 +18,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.0.0\n" + "2.0.4\n" ] } ], @@ -30,7 +30,7 @@ "from transport import providers\n", "import pandas as pd\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "sqw = transport.factory.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends',context='write')\n", + "sqw = transport.get.writer(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n", "sqw.write(_data,if_exists='replace') #-- default is append\n", "print (transport.__version__)" ] @@ -46,9 +46,11 @@ "- Basic read of the designated table (friends) created above\n", "- Execute an aggregate SQL against the table\n", "\n", + "\n", "**NOTE**\n", "\n", - "It is possible to use **transport.factory.instance** or **transport.instance** they are the same. It allows the maintainers to know that we used a factory design pattern." + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" ] }, { @@ -74,10 +76,10 @@ "\n", "import transport\n", "from transport import providers\n", - "pgr = transport.instance(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n", - "_df = pgr.read()\n", + "sqr = transport.get.reader(provider=providers.SQLITE,database='/home/steve/demo.db3',table='friends')\n", + "_df = sqr.read()\n", "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", - "_sdf = pgr.read(sql=_query)\n", + "_sdf = sqr.read(sql=_query)\n", "print (_df)\n", "print ('--------- STATISTICS ------------')\n", "print (_sdf)" @@ -87,11 +89,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The cell bellow show the content of an auth_file, in this case if the dataset/table in question is not to be shared then you can use auth_file with information associated with the parameters.\n", + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", "\n", - "**NOTE**:\n", + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", "\n", - "The auth_file is intended to be **JSON** formatted. 
This is an overkill for SQLite ;-)" + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" ] }, { From 2b5c0386100615b1e5270d1b35e65b6551deb2d0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 10 Jun 2024 02:58:28 -0500 Subject: [PATCH 219/271] documentation ... --- notebooks/etl.ipynb | 188 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 notebooks/etl.ipynb diff --git a/notebooks/etl.ipynb b/notebooks/etl.ipynb new file mode 100644 index 0000000..b274da2 --- /dev/null +++ b/notebooks/etl.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Extract Transform Load (ETL) from Code\n", + "\n", + "The example below reads data from an http source (github) and will copy the data to a csv file and to a database. This example illustrates the one-to-many ETL features.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      [ HTML table rendering of the addresses data-frame omitted - it duplicates the text/plain output shown below ]
" + ], + "text/plain": [ + " id location_id address_1 address_2 city \\\n", + "0 1 1 2600 Middlefield Road NaN Redwood City \n", + "1 2 2 24 Second Avenue NaN San Mateo \n", + "2 3 3 24 Second Avenue NaN San Mateo \n", + "3 4 4 24 Second Avenue NaN San Mateo \n", + "4 5 5 24 Second Avenue NaN San Mateo \n", + "\n", + " state_province postal_code country \n", + "0 CA 94063 US \n", + "1 CA 94401 US \n", + "2 CA 94403 US \n", + "3 CA 94401 US \n", + "4 CA 94401 US " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "#\n", + "#\n", + "source = {\"provider\": \"http\", \"url\": \"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv\"}\n", + "target = [{\"provider\": \"files\", \"path\": \"addresses.csv\", \"delimiter\": \",\"}, {\"provider\": \"sqlite\", \"database\": \"sample.db3\", \"table\": \"addresses\"}]\n", + "\n", + "_handler = transport.get.etl (source=source,target=target)\n", + "_data = _handler.read() #-- all etl begins with data being read\n", + "_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Extract Transform Load (ETL) from CLI\n", + "\n", + "The documentation for this is available at https://healthcareio.the-phi.com/data-transport \"Docs\" -> \"Terminal CLI\"\n", + "\n", + "The entire process is documented including how to generate an ETL configuration file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 6544bf852acdac578b78c03c3e7fb9e2a6b9f392 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 14:14:12 -0500 Subject: [PATCH 220/271] feature: registry for security and enterprise use --- bin/transport | 94 ++++++++++++++++--------------------- transport/__init__.py | 7 +++ transport/other/callback.py | 4 ++ transport/other/console.py | 3 ++ transport/registry.py | 81 ++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+), 54 deletions(-) create mode 100644 transport/registry.py diff --git a/bin/transport b/bin/transport index fd5d41b..4f9a7e8 100755 --- a/bin/transport +++ b/bin/transport @@ -13,29 +13,6 @@ The above copyright notice and this permission notice shall be included in all c THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-Usage : - transport help -- will print this page - - transport move [index] - path to the configuration file - optional index within the configuration file - -e.g: configuration file (JSON formatted) - - single source to a single target - - {"source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"} - "target":{"provider":"sqlite3","path":"transport-demo.sqlite","table":"agreement"} - } - - - single source to multiple targets - { - "source":{"provider":"http","url":"https://cdn.wsform.com/wp-content/uploads/2020/06/agreement.csv"}, - "target":[ - {"provider":"sqlite3","path":"transport-demo.sqlite","table":"agreement}, - {"provider":"mongodb","db":"transport-demo","collection":"agreement"} - ] - } - """ import pandas as pd import numpy as np @@ -53,9 +30,13 @@ import typer from typing_extensions import Annotated from typing import Optional import time +from termcolor import colored app = typer.Typer() - +REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) +REGISTRY_FILE= 'transport-registry.json' +CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']']) +TIMES_MARK= ' '.join(['[',colored(u'\u2717','red'),']']) # @app.command() def help() : print (__doc__) @@ -68,7 +49,7 @@ def wait(jobs): def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], index:int = typer.Option(help="index of the item of interest, otherwise everything in the file will be processed")): """ - This function applies data transport from one source to one or several others + This function applies data transport ETL feature to read data from one source to write it one or several others """ # _proxy = lambda _object: _object.write(_object.read()) if os.path.exists(path): @@ -124,35 +105,40 @@ def generate (path:Annotated[str,typer.Argument(help="path of the ETL configurat file = open(path,'w') file.write(json.dumps(_config)) file.close() + +@app.command(name="init") +def initregistry (email:Annotated[str,typer.Argument(help="email")], + path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"), + override:bool=typer.Option(default=False,help="override existing configuration or not")): + """ + This functiion will initialize the registry and have both application and calling code loading the database parameters by a label + + """ + try: + transport.registry.init(email=email, path=path, override=override) + _msg = f"""{CHECK_MARK} Successfully wrote configuration to {path} from {email}""" + except Exception as e: + _msg = f"{TIMES_MARK} {e}" + print (_msg) + print () +@app.command(name="register") +def register (label:Annotated[str,typer.Argument(help="unique label that will be used to load the parameters of the database")], + auth_file:Annotated[str,typer.Argument(help="path of the auth_file")], + default:bool=typer.Option(default=False,help="set the auth_file as default"), + path:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")): + """ + This function will register an auth-file i.e database connection and assign it a label, + Learn more about auth-file at https://healthcareio.the-phi.com/data-transport + """ + try: + transport.registry.set(label=label,auth_file=auth_file, default=default, path=path) + _msg = f"""{CHECK_MARK} Successfully added label "{label}" to data-transport registry""" + except Exception as e: + _msg = f"""{TIMES_MARK} {e}""" + print (_msg) + + pass if __name__ == '__main__' : app() -# # -# # Load information from the file ... 
-# if 'help' in SYS_ARGS : -# print (__doc__) -# else: -# try: -# _info = json.loads(open(SYS_ARGS['config']).read()) -# if 'index' in SYS_ARGS : -# _index = int(SYS_ARGS['index']) -# _info = [_item for _item in _info if _info.index(_item) == _index] -# pass -# elif 'id' in SYS_ARGS : -# _info = [_item for _item in _info if 'id' in _item and _item['id'] == SYS_ARGS['id']] - -# procs = 1 if 'procs' not in SYS_ARGS else int(SYS_ARGS['procs']) -# jobs = transport.factory.instance(provider='etl',info=_info,procs=procs) -# print ([len(jobs),' Jobs are running']) -# N = len(jobs) -# while jobs : -# x = len(jobs) -# jobs = [_job for _job in jobs if _job.is_alive()] -# if x != len(jobs) : -# print ([len(jobs),'... jobs still running']) -# time.sleep(1) -# print ([N,' Finished running']) -# except Exception as e: - -# print (e) - + diff --git a/transport/__init__.py b/transport/__init__.py index d7d4518..2f97c0f 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -26,8 +26,11 @@ from info import __version__,__author__,__email__,__license__,__app_name__ from transport.iowrapper import IWriter, IReader, IETL from transport.plugins import PluginLoader from transport import providers +import copy +from transport import registry PROVIDERS = {} + def init(): global PROVIDERS for _module in [cloud,sql,nosql,other] : @@ -45,6 +48,10 @@ def instance (**_args): kwargs These are arguments that are provider/vendor specific """ global PROVIDERS + if not registry.isloaded () : + registry.load() if 'path' not in _args else registry.load(_args['path']) + if 'label' in _args : + _info = registry.load(_args['label']) if 'auth_file' in _args: if os.path.exists(_args['auth_file']) : # diff --git a/transport/other/callback.py b/transport/other/callback.py index c56c175..aba2f02 100644 --- a/transport/other/callback.py +++ b/transport/other/callback.py @@ -1,3 +1,7 @@ +""" +This module uses callback architectural style as a writer to enable user-defined code to handle the output of a reader +The intent is to allow users to have control over the output of data to handle things like logging, encryption/decryption and other +""" import queue from threading import Thread, Lock # from transport.common import Reader,Writer diff --git a/transport/other/console.py b/transport/other/console.py index 16f589a..b2f374b 100644 --- a/transport/other/console.py +++ b/transport/other/console.py @@ -1,3 +1,6 @@ +""" +This class uses classback pattern to allow output to be printed to the console (debugging) +""" from . import callback diff --git a/transport/registry.py b/transport/registry.py new file mode 100644 index 0000000..9fe942d --- /dev/null +++ b/transport/registry.py @@ -0,0 +1,81 @@ +import os +import json +from info import __version__ +import copy +import transport + +""" +This class manages data from the registry and allows (read only) +@TODO: add property to the DATA attribute +""" + +REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) +REGISTRY_FILE= 'transport-registry.json' + +DATA = {} + +def isloaded (): + return DATA not in [{},None] +def load (_path=REGISTRY_PATH): + global DATA + _path = os.sep.join([_path,REGISTRY_FILE]) + if os.path.exists(_path) : + f = open(_path) + DATA = json.loads(f.read()) + f.close() +def init (email,path=REGISTRY_PATH,override=False): + """ + Initializing the registry and will raise an exception in the advent of an issue + """ + p = '@' in email + q = False if '.' 
not in email else email.split('.')[-1] in ['edu','com','io','ai'] + if p and q : + _config = {"email":email,'version':__version__} + if not os.path.exists(path): + os.makedirs(path) + filename = os.sep.join([path,REGISTRY_FILE]) + if not os.path.exists(filename) or override == True : + + f = open(filename,'w') + f.write( json.dumps(_config)) + f.close() + # _msg = f"""{CHECK_MARK} Successfully wrote configuration to {path} from {email}""" + + else: + raise Exception (f"""Unable to write configuration, Please check parameters (or help) and try again""") + else: + raise Exception (f"""Invalid Input, {email} is not well formatted, provide an email with adequate format""") + +def get (label='default') : + global DATA + return copy.copy(DATA[label]) if label in DATA else {} + +def set (label, auth_file, default=False,path=REGISTRY_PATH) : + reg_file = os.sep.join([path,REGISTRY_FILE]) + if os.path.exists (auth_file) and os.path.exists(path) and os.path.exists(reg_file): + f = open(auth_file) + _info = json.loads(f.read()) + f.close() + f = open(reg_file) + _config = json.loads(f.read()) + f.close() + + # + # set the proposed label + _object = transport.factory.instance(**_info) + if _object : + _config[label] = _info + if default : + _config['default'] = _info + # + # now we need to write this to the location + f = open(reg_file,'w') + f.write(json.dumps(_config)) + f.close() + else: + _msg = f"""Unable to load file locate at {path},\nLearn how to generate auth-file with wizard found at https://healthcareio.the-phi.com/data-transport""" + pass + else: + pass + pass + From 8edb764d112c9a14b2c4c8a26733920de8513ffa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 14:16:06 -0500 Subject: [PATCH 221/271] documentation typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 528176d..42bc859 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Mostly data scientists that don't really care about the underlying database and 1. Familiarity with **pandas data-frames** 2. Connectivity **drivers** are included -3. Mining data from various sources +3. Reading/Writing data from various sources 4. 
Useful for data migrations or **ETL** From b9bc898161f6ee4810dc7c9d30360af2f39a07a1 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 15:30:09 -0500 Subject: [PATCH 222/271] bug fix: registry (more usable) and added to factory method --- bin/transport | 33 ++++++++++++++++++++------------- transport/__init__.py | 29 +++++++++++++++++++++++++---- transport/registry.py | 26 +++++++++++++++++++++----- 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/bin/transport b/bin/transport index 4f9a7e8..4053c4e 100755 --- a/bin/transport +++ b/bin/transport @@ -47,7 +47,7 @@ def wait(jobs): @app.command(name="apply") def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], - index:int = typer.Option(help="index of the item of interest, otherwise everything in the file will be processed")): + index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")): """ This function applies data transport ETL feature to read data from one source to write it one or several others """ @@ -92,19 +92,23 @@ def version(): @app.command() def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]): - """ - This function will generate a configuration template to give a sense of how to create one - """ - _config = [ - { - "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, - "target": + """ + This function will generate a configuration template to give a sense of how to create one + """ + _config = [ + { + "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, + "target": [{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] } ] - file = open(path,'w') - file.write(json.dumps(_config)) - file.close() + file = open(path,'w') + file.write(json.dumps(_config)) + file.close() + print (f"""{CHECK_MARK} Successfully generated a template ETL file at {path}""" ) + print ("""NOTE: Each line (source or target) is the content of an auth-file""") + + @app.command(name="init") def initregistry (email:Annotated[str,typer.Argument(help="email")], @@ -131,8 +135,11 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be Learn more about auth-file at https://healthcareio.the-phi.com/data-transport """ try: - transport.registry.set(label=label,auth_file=auth_file, default=default, path=path) - _msg = f"""{CHECK_MARK} Successfully added label "{label}" to data-transport registry""" + if transport.registry.exists(path) : + transport.registry.set(label=label,auth_file=auth_file, default=default, path=path) + _msg = f"""{CHECK_MARK} Successfully added label "{label}" to data-transport registry""" + else: + _msg = f"""{TIMES_MARK} Registry is not initialized, please initialize the registry (check help)""" except Exception as e: _msg = f"""{TIMES_MARK} {e}""" print (_msg) diff --git a/transport/__init__.py b/transport/__init__.py index 2f97c0f..b2ea543 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -48,10 +48,16 @@ def instance (**_args): kwargs These are arguments that are provider/vendor specific """ global PROVIDERS - if not registry.isloaded () : - registry.load() if 'path' not in _args else registry.load(_args['path']) - if 'label' in _args : - _info = 
registry.load(_args['label']) + # if not registry.isloaded () : + # if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): + # registry.load() if 'path' not in _args else registry.load(_args['path']) + # print ([' GOT IT']) + # if 'label' in _args and registry.isloaded(): + # _info = registry.get(_args['label']) + # if _info : + # # + # _args = dict(_args,**_info) + if 'auth_file' in _args: if os.path.exists(_args['auth_file']) : # @@ -67,6 +73,17 @@ def instance (**_args): else: filename = _args['auth_file'] raise Exception(f" {filename} was not found or is invalid") + if 'provider' not in _args and 'auth_file' not in _args : + if not registry.isloaded () : + if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): + registry.load() if 'path' not in _args else registry.load(_args['path']) + if 'label' in _args and registry.isloaded(): + _info = registry.get(_args['label']) + print(_info) + if _info : + # + _args = dict(_args,**_info) + if 'provider' in _args and _args['provider'] in PROVIDERS : _info = PROVIDERS[_args['provider']] _module = _info['module'] @@ -110,6 +127,8 @@ class get : """ @staticmethod def reader (**_args): + if not _args : + _args['label'] = 'default' _args['context'] = 'read' return instance(**_args) @staticmethod @@ -117,6 +136,8 @@ class get : """ This function is a wrapper that will return a writer to a database. It disambiguates the interface """ + if not _args : + _args['label'] = 'default' _args['context'] = 'write' return instance(**_args) @staticmethod diff --git a/transport/registry.py b/transport/registry.py index 9fe942d..f487b54 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -16,11 +16,20 @@ DATA = {} def isloaded (): return DATA not in [{},None] +def exists (path=REGISTRY_PATH) : + """ + This function determines if there is a registry at all + """ + p = os.path.exists(path) + q = os.path.exists( os.sep.join([path,REGISTRY_FILE])) + print ([p,q, os.sep.join([path,REGISTRY_FILE])]) + return p and q def load (_path=REGISTRY_PATH): global DATA - _path = os.sep.join([_path,REGISTRY_FILE]) - if os.path.exists(_path) : - f = open(_path) + + if exists(_path) : + path = os.sep.join([_path,REGISTRY_FILE]) + f = open(path) DATA = json.loads(f.read()) f.close() def init (email,path=REGISTRY_PATH,override=False): @@ -45,12 +54,19 @@ def init (email,path=REGISTRY_PATH,override=False): raise Exception (f"""Unable to write configuration, Please check parameters (or help) and try again""") else: raise Exception (f"""Invalid Input, {email} is not well formatted, provide an email with adequate format""") - +def lookup (label): + global DATA + return label in DATA def get (label='default') : global DATA return copy.copy(DATA[label]) if label in DATA else {} def set (label, auth_file, default=False,path=REGISTRY_PATH) : + """ + This function will add a label (auth-file data) into the registry and can set it as the default + """ + if label == 'default' : + raise Exception ("""Invalid label name provided, please change the label name and use the switch""") reg_file = os.sep.join([path,REGISTRY_FILE]) if os.path.exists (auth_file) and os.path.exists(path) and os.path.exists(reg_file): f = open(auth_file) @@ -73,7 +89,7 @@ def set (label, auth_file, default=False,path=REGISTRY_PATH) : f.write(json.dumps(_config)) f.close() else: - _msg = f"""Unable to load file locate at {path},\nLearn how to generate auth-file with wizard found at https://healthcareio.the-phi.com/data-transport""" + raise Exception( f"""Unable to 
load file locate at {path},\nLearn how to generate auth-file with wizard found at https://healthcareio.the-phi.com/data-transport""") pass else: pass From 24cdd9f8fe2a44caeb0bb65fc81429230272957b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 19:56:42 -0500 Subject: [PATCH 223/271] bug fix: print statement --- transport/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/registry.py b/transport/registry.py index f487b54..b8d5b16 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -22,7 +22,7 @@ def exists (path=REGISTRY_PATH) : """ p = os.path.exists(path) q = os.path.exists( os.sep.join([path,REGISTRY_FILE])) - print ([p,q, os.sep.join([path,REGISTRY_FILE])]) + return p and q def load (_path=REGISTRY_PATH): global DATA From 8aa6f2c93de50d6b2b35e7aae83d20e1857de34a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 20:05:12 -0500 Subject: [PATCH 224/271] bug fix: improve handling in registry --- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index b2ea543..27f2efb 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -79,7 +79,7 @@ def instance (**_args): registry.load() if 'path' not in _args else registry.load(_args['path']) if 'label' in _args and registry.isloaded(): _info = registry.get(_args['label']) - print(_info) + if _info : # _args = dict(_args,**_info) From dde4767e37b2824cddb29cce1857ea5b95e464a6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 14 Jun 2024 20:11:33 -0500 Subject: [PATCH 225/271] new version --- info/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info/__init__.py b/info/__init__.py index f45fdcd..d34b2f4 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.0.4' +__version__= '2.2.0' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. 
Nyemba From c443c6c953b1bf2ca7eae4cbcce6dde0e095c403 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 15 Jun 2024 00:50:53 -0500 Subject: [PATCH 226/271] duckdb support --- setup.py | 4 ++-- transport/sql/__init__.py | 2 +- transport/sql/duckdb.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 transport/sql/duckdb.py diff --git a/setup.py b/setup.py index 002feb8..9b46d71 100644 --- a/setup.py +++ b/setup.py @@ -18,8 +18,8 @@ args = { # "packages":["transport","info","transport/sql"]}, "packages": find_packages(include=['info','transport', 'transport.*'])} -args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] +args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] +args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] # if sys.version_info[0] == 2 : diff --git a/transport/sql/__init__.py b/transport/sql/__init__.py index 9d026bf..b5aaa98 100644 --- a/transport/sql/__init__.py +++ b/transport/sql/__init__.py @@ -3,7 +3,7 @@ This namespace/package wrap the sql functionalities for a certain data-stores - netezza, postgresql, mysql and sqlite - mariadb, redshift (also included) """ -from . import postgresql, mysql, netezza, sqlite, sqlserver +from . 
import postgresql, mysql, netezza, sqlite, sqlserver, duckdb # diff --git a/transport/sql/duckdb.py b/transport/sql/duckdb.py new file mode 100644 index 0000000..ab82bb2 --- /dev/null +++ b/transport/sql/duckdb.py @@ -0,0 +1,21 @@ +""" +This module implements the handler for duckdb (in memory or not) +""" +from transport.sql.common import Base, BaseReader, BaseWriter + +class Duck : + def __init__(self,**_args): + self.database = _args['database'] + def get_provider(self): + return "duckdb" + + def _get_uri(self,**_args): + return f"""duckdb:///{self.database}""" +class Reader(Duck,BaseReader) : + def __init__(self,**_args): + Duck.__init__(self,**_args) + BaseReader.__init__(self,**_args) +class Writer(Duck,BaseWriter): + def __init__(self,**_args): + Duck.__init__(self,**_args) + BaseWriter.__init__(self,**_args) From 037019c1d79b367daf1d4656d6ec47db8f3a8037 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sat, 15 Jun 2024 01:12:29 -0500 Subject: [PATCH 227/271] bug fix --- transport/providers/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py index 4a583f7..6422d74 100644 --- a/transport/providers/__init__.py +++ b/transport/providers/__init__.py @@ -10,8 +10,11 @@ HTTP='http' BIGQUERY ='bigquery' FILE = 'file' ETL = 'etl' + SQLITE = 'sqlite' SQLITE3= 'sqlite3' +DUCKDB = 'duckdb' + REDSHIFT = 'redshift' NETEZZA = 'netezza' MYSQL = 'mysql' @@ -42,5 +45,6 @@ PGSQL = POSTGRESQL AWS_S3 = 's3' RABBIT = RABBITMQ + # QLISTENER = 'qlistener' \ No newline at end of file From 235a44be66e26c8540e80d489d445e2d4dbd3f58 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Jun 2024 08:38:46 -0500 Subject: [PATCH 228/271] bug fix: registry and parameter handling --- README.md | 10 +++++++++- info/__init__.py | 6 ++++++ transport/__init__.py | 11 ++++++++--- transport/registry.py | 5 +++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 42bc859..dd2beb1 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,14 @@ Within the virtual environment perform the following : pip install git+https://github.com/lnyemba/data-transport.git +## What's new + +Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin: + + 1. Simpler syntax to create reader or writer + 2. auth-file registry that can be referenced using a label + + ## Learn More -We have available notebooks with sample code to read/write against mongodb, couchdb, Netezza, PostgreSQL, Google Bigquery, Databricks, Microsoft SQL Server, MySQL ... Visit [data-transport homepage](https://healthcareio.the-phi.com/data-transport) \ No newline at end of file +We have available notebooks with sample code to read/write against mongodb, couchdb, Netezza, PostgreSQL, Google Bigquery, Databricks, Microsoft SQL Server, MySQL ... Visit [data-transport homepage](https://healthcareio.the-phi.com/data-transport) diff --git a/info/__init__.py b/info/__init__.py index d34b2f4..d84150e 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -12,3 +12,9 @@ The above copyright notice and this permission notice shall be included in all c THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ + +__whatsnew__=f"""version {__version__}, focuses on collaborative environments like jupyter-base servers (apache zeppelin; jupyter notebook, jupyterlab, jupyterhub) + + 1. simpler syntax to create readers/writers + 2. auth-file registry that can be referenced using a label +""" diff --git a/transport/__init__.py b/transport/__init__.py index 27f2efb..6062453 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -22,7 +22,7 @@ from transport import sql, nosql, cloud, other import pandas as pd import json import os -from info import __version__,__author__,__email__,__license__,__app_name__ +from info import __version__,__author__,__email__,__license__,__app_name__,__whatsnew__ from transport.iowrapper import IWriter, IReader, IETL from transport.plugins import PluginLoader from transport import providers @@ -38,7 +38,11 @@ def init(): if _provider_name.startswith('__') or _provider_name == 'common': continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} - +def _getauthfile (path) : + f = open(path) + _object = json.loads(f.read()) + f.close() + return _object def instance (**_args): """ This function returns an object of to read or write from a supported database provider/vendor @@ -82,7 +86,8 @@ def instance (**_args): if _info : # - _args = dict(_args,**_info) + # _args = dict(_args,**_info) + _args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments if 'provider' in _args and _args['provider'] in PROVIDERS : _info = PROVIDERS[_args['provider']] diff --git a/transport/registry.py b/transport/registry.py index b8d5b16..ad94481 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -10,6 +10,11 @@ This class manages data from the registry and allows (read only) """ REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) +# +# This path can be overriden by an environment variable ... +# +if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ : + REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH'] REGISTRY_FILE= 'transport-registry.json' DATA = {} From 2edce85aede5c92b4d70540a09ae838baf6184f0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Jun 2024 08:40:24 -0500 Subject: [PATCH 229/271] documentation duckdb support --- README.md | 1 + info/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index dd2beb1..bfa67d9 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Unlike older versions 2.0 and under, we focus on collaborative environments like 1. Simpler syntax to create reader or writer 2. auth-file registry that can be referenced using a label + 3. duckdb support ## Learn More diff --git a/info/__init__.py b/info/__init__.py index d84150e..6379b6c 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -17,4 +17,5 @@ __whatsnew__=f"""version {__version__}, focuses on collaborative environments li 1. simpler syntax to create readers/writers 2. auth-file registry that can be referenced using a label + 3. 
duckdb support """ From 808378afdbec21144288889b476890143a751dcb Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Jun 2024 09:22:56 -0500 Subject: [PATCH 230/271] bug fix: delegate (new feature) --- transport/__init__.py | 14 ++++++++------ transport/iowrapper.py | 7 +++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index 6062453..16a2467 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -81,13 +81,15 @@ def instance (**_args): if not registry.isloaded () : if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): registry.load() if 'path' not in _args else registry.load(_args['path']) + _info = {} if 'label' in _args and registry.isloaded(): _info = registry.get(_args['label']) - - if _info : - # - # _args = dict(_args,**_info) - _args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments + else: + _info = registry.get() + if _info : + # + # _args = dict(_args,**_info) + _args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments if 'provider' in _args and _args['provider'] in PROVIDERS : _info = PROVIDERS[_args['provider']] @@ -132,7 +134,7 @@ class get : """ @staticmethod def reader (**_args): - if not _args : + if not _args or 'provider' not in _args: _args['label'] = 'default' _args['context'] = 'read' return instance(**_args) diff --git a/transport/iowrapper.py b/transport/iowrapper.py index d6cba1c..e3abf6c 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -52,6 +52,13 @@ class IO: if hasattr(self._agent,'apply') : return self._agent.apply(_query) return None + def submit(self,_query): + return self.delegate('submit',_query) + def delegate(self,_name,_query): + if hasattr(self._agent,_name) : + pointer = getattr(self._agent,_name) + return pointer(_query) + return None class IReader(IO): """ This is a wrapper for read functionalities From 6f6fd489821664692d56951e97d7c27ec14bab86 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 25 Jun 2024 11:48:57 -0500 Subject: [PATCH 231/271] bug fixes: environment variable usage --- transport/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/registry.py b/transport/registry.py index ad94481..6764f1b 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -42,7 +42,7 @@ def init (email,path=REGISTRY_PATH,override=False): Initializing the registry and will raise an exception in the advent of an issue """ p = '@' in email - q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai'] + q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai','org'] if p and q : _config = {"email":email,'version':__version__} if not os.path.exists(path): From 3faee02fa26e0cef920ec9468fb6460dccdff869 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 7 Jul 2024 11:39:29 -0500 Subject: [PATCH 232/271] documentation ... --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index bfa67d9..7d8b414 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,11 @@ Within the virtual environment perform the following : pip install git+https://github.com/lnyemba/data-transport.git +## Features + + - read/write from over a dozen databases + - run ETL jobs seamlessly + - scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ... 
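+
+A minimal sketch (it assumes a connection was saved to the data-transport registry, e.g. with `transport register mydb <auth-file>` or registered as default with the `--default` switch):
+
+    import transport
+    reader = transport.get.reader()                  # uses the "default" entry of the registry
+    # reader = transport.get.reader(label='mydb')    # or reference a specific registered label
+    _df = reader.read()                              # returns a pandas data-frame
+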
## What's new From 40f9c3930a1964dc2dd75112701ae8cbfffdb3e2 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Jul 2024 15:50:46 -0500 Subject: [PATCH 233/271] bug fixes, using boto3 instead of boto for s3 support --- info/__init__.py | 2 +- setup.py | 2 +- transport/__init__.py | 4 +- transport/cloud/s3.py | 148 ++++++++++++++++++++++-------------------- 4 files changed, 80 insertions(+), 76 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 6379b6c..04183a9 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.0' +__version__= '2.2.1' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. Nyemba diff --git a/setup.py b/setup.py index 9b46d71..7bb44e8 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] +args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] # if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index 16a2467..b934760 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -134,7 +134,7 @@ class get : """ @staticmethod def reader (**_args): - if not _args or 'provider' not in _args: + if not _args or ('provider' not in _args and 'label' not in _args): _args['label'] = 'default' _args['context'] = 'read' return instance(**_args) @@ -143,7 +143,7 @@ class get : """ This function is a wrapper that will return a writer to a database. It disambiguates the interface """ - if not _args : + if not _args or ('provider' not in _args and 'label' not in _args): _args['label'] = 'default' _args['context'] = 'write' return instance(**_args) diff --git a/transport/cloud/s3.py b/transport/cloud/s3.py index 4e230e8..095cfd3 100644 --- a/transport/cloud/s3.py +++ b/transport/cloud/s3.py @@ -5,8 +5,8 @@ Steve L. 
Nyemba, The Phi Technology LLC This file is a wrapper around s3 bucket provided by AWS for reading and writing content """ from datetime import datetime -import boto -from boto.s3.connection import S3Connection, OrdinaryCallingFormat +import boto3 +# from boto.s3.connection import S3Connection, OrdinaryCallingFormat import numpy as np import botocore from smart_open import smart_open @@ -14,6 +14,7 @@ import sys import json from io import StringIO +import pandas as pd import json class s3 : @@ -29,46 +30,37 @@ class s3 : @param filter filename or filtering elements """ try: - self.s3 = S3Connection(args['access_key'],args['secret_key'],calling_format=OrdinaryCallingFormat()) - self.bucket = self.s3.get_bucket(args['bucket'].strip(),validate=False) if 'bucket' in args else None - # self.path = args['path'] - self.filter = args['filter'] if 'filter' in args else None - self.filename = args['file'] if 'file' in args else None - self.bucket_name = args['bucket'] if 'bucket' in args else None - + self._client = boto3.client('s3',aws_access_key_id=args['access_key'],aws_secret_access_key=args['secret_key'],region_name=args['region']) + self._bucket_name = args['bucket'] + self._file_name = args['file'] + self._region = args['region'] except Exception as e : - self.s3 = None - self.bucket = None print (e) + pass + def has(self,**_args): + _found = None + try: + if 'file' in _args and 'bucket' in _args: + _found = self.meta(**_args) + elif 'bucket' in _args and not 'file' in _args: + _found = self._client.list_objects(Bucket=_args['bucket']) + elif 'file' in _args and not 'bucket' in _args : + _found = self.meta(bucket=self._bucket_name,file = _args['file']) + except Exception as e: + _found = None + pass + return type(_found) == dict def meta(self,**args): """ + This function will return information either about the file in a given bucket :name name of the bucket """ - info = self.list(**args) - [item.open() for item in info] - return [{"name":item.name,"size":item.size} for item in info] - def list(self,**args): - """ - This function will list the content of a bucket, the bucket must be provided by the name - :name name of the bucket - """ - return list(self.s3.get_bucket(args['name']).list()) - - - def buckets(self): - # - # This function will return all buckets, not sure why but it should be used cautiously - # based on why the s3 infrastructure is used - # - return [item.name for item in self.s3.get_all_buckets()] - - # def buckets(self): - pass - # """ - # This function is a wrapper around the bucket list of buckets for s3 - # """ - # return self.s3.get_all_buckets() - + _bucket = self._bucket_name if 'bucket' not in args else args['bucket'] + _file = self._file_name if 'file' not in args else args['file'] + _data = self._client.get_object(Bucket=_bucket,Key=_file) + return _data['ResponseMetadata'] + def close(self): + self._client.close() class Reader(s3) : """ @@ -77,51 +69,63 @@ class Reader(s3) : - stream content if file is Not None @TODO: support read from all buckets, think about it """ - def __init__(self,**args) : - s3.__init__(self,**args) - def files(self): - r = [] - try: - return [item.name for item in self.bucket if item.size > 0] - except Exception as e: - pass - return r - def stream(self,limit=-1): + def __init__(self,**_args) : + super().__init__(**_args) + + def _stream(self,**_args): """ At this point we should stream a file from a given bucket """ - key = self.bucket.get_key(self.filename.strip()) - if key is None : - yield None + _object = 
self._client.get_object(Bucket=_args['bucket'],Key=_args['file']) + _stream = None + try: + _stream = _object['Body'].read() + except Exception as e: + pass + if not _stream : + return None + if _object['ContentType'] in ['text/csv'] : + return pd.read_csv(StringIO(str(_stream).replace("\\n","\n").replace("\\r","").replace("\'",""))) else: - count = 0 - with smart_open(key) as remote_file: - for line in remote_file: - if count == limit and limit > 0 : - break - yield line - count += 1 + return _stream + def read(self,**args) : - if self.filename is None : - # - # returning the list of files because no one file was specified. - return self.files() - else: - limit = args['size'] if 'size' in args else -1 - return self.stream(limit) + + _name = self._file_name if 'file' not in args else args['file'] + _bucket = args['bucket'] if 'bucket' in args else self._bucket_name + return self._stream(bucket=_bucket,file=_name) + class Writer(s3) : - - def __init__(self,**args) : - s3.__init__(self,**args) - def mkdir(self,name): + """ + + """ + def __init__(self,**_args) : + super().__init__(**_args) + # + # + if not self.has(bucket=self._bucket_name) : + self.make_bucket(self._bucket_name) + def make_bucket(self,bucket_name): """ - This function will create a folder in a bucket + This function will create a folder in a bucket,It is best that the bucket is organized as a namespace :name name of the folder """ - self.s3.put_object(Bucket=self.bucket_name,key=(name+'/')) - def write(self,content): - file = StringIO(content.decode("utf8")) - self.s3.upload_fileobj(file,self.bucket_name,self.filename) + + self._client.create_bucket(Bucket=bucket_name,CreateBucketConfiguration={'LocationConstraint': self._region}) + def write(self,_data,**_args): + """ + This function will write the data to the s3 bucket, files can be either csv, or json formatted files + """ + if type(_data) == pd.DataFrame : + _stream = _data.to_csv(index=False) + elif type(_data) == dict : + _stream = json.dumps(_data) + else: + _stream = _data + file = StringIO(_stream) + bucket = self._bucket_name if 'bucket' not in _args else _args['bucket'] + file_name = self._file_name if 'file' not in _args else _args['file'] + self._client.put_object(Bucket=bucket, Key = file_name, Body=_stream) pass From 9dba5daecdca58a20bbcd28a0927e496afcca641 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Jul 2024 17:26:11 -0500 Subject: [PATCH 234/271] bug fix, TODO: figure out how to parse types --- transport/cloud/s3.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transport/cloud/s3.py b/transport/cloud/s3.py index 095cfd3..4b4515d 100644 --- a/transport/cloud/s3.py +++ b/transport/cloud/s3.py @@ -117,15 +117,18 @@ class Writer(s3) : """ This function will write the data to the s3 bucket, files can be either csv, or json formatted files """ + content = 'text/plain' if type(_data) == pd.DataFrame : _stream = _data.to_csv(index=False) + content = 'text/csv' elif type(_data) == dict : _stream = json.dumps(_data) + content = 'application/json' else: _stream = _data file = StringIO(_stream) bucket = self._bucket_name if 'bucket' not in _args else _args['bucket'] file_name = self._file_name if 'file' not in _args else _args['file'] - self._client.put_object(Bucket=bucket, Key = file_name, Body=_stream) + self._client.put_object(Bucket=bucket, Key = file_name, Body=_stream,ContentType=content) pass From 63666e95ce35c3e73ac6ccd3cc2c562d404d963a Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Jul 2024 17:27:49 -0500 
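A minimal sketch of how the boto3-based s3 wrapper above can be reached through the factory; the access/secret keys and the region are placeholders, and the bucket/file names are only illustrative:

    import transport
    import pandas as pd

    _auth = {'provider':'s3',
             'access_key':'<ACCESS_KEY>', 'secret_key':'<SECRET_KEY>',
             'region':'us-east-1',
             'bucket':'com.phi.demo', 'file':'friends.csv'}

    # write a dataframe as friends.csv in the bucket (stored as text/csv)
    transport.get.writer(**_auth).write(pd.DataFrame({'name':['James Bond'],'age':[55]}))

    # read it back; csv content is returned as a pandas dataframe
    print (transport.get.reader(**_auth).read())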
Subject: [PATCH 235/271] bug fix, TODO: figure out how to parse types --- transport/cloud/s3.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transport/cloud/s3.py b/transport/cloud/s3.py index 4b4515d..81e2e69 100644 --- a/transport/cloud/s3.py +++ b/transport/cloud/s3.py @@ -3,7 +3,10 @@ Data Transport - 1.0 Steve L. Nyemba, The Phi Technology LLC This file is a wrapper around s3 bucket provided by AWS for reading and writing content +TODO: + - Address limitations that will properly read csv if it is stored with content type text/csv """ + from datetime import datetime import boto3 # from boto.s3.connection import S3Connection, OrdinaryCallingFormat From 955369fdd8e0671cacf4fd641bf092d20c54daaa Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 10 Jul 2024 17:33:34 -0500 Subject: [PATCH 236/271] aws s3 notebook, brief example --- notebooks/s3.ipynb | 131 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 notebooks/s3.ipynb diff --git a/notebooks/s3.ipynb b/notebooks/s3.ipynb new file mode 100644 index 0000000..1009120 --- /dev/null +++ b/notebooks/s3.ipynb @@ -0,0 +1,131 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to AWS S3\n", + "\n", + "We have setup our demo environment with the label **aws** passed to reference our s3 access_key and secret_key and file. In the cell below we will write the data to our aws s3 bucket named **com.phi.demo**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.2.1\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to mongodb database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "mgw = transport.get.writer(label='aws',file='friends.csv',bucket='com.phi.demo')\n", + "mgw.write(_data)\n", + "print (transport.__version__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from AWS S3\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a mongodb pipeline. 
The code in the background executes an aggregation using\n", + "\n", + "- Basic read of the designated file **friends.csv**\n", + "- Compute average age using standard pandas functions\n", + "\n", + "**NOTE**\n", + "\n", + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " bname age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n", + "83.0\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "\n", + "def cast(stream) :\n", + " print (stream)\n", + " return pd.DataFrame(str(stream))\n", + "mgr = transport.get.reader(label='aws', bucket='com.phi.demo',file='friends.csv')\n", + "_df = mgr.read()\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "print (_df.age.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", + "\n", + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", + "\n", + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a7c72391e89c64f5f9be9c80f750cd5437a3c847 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 26 Jul 2024 17:49:39 -0500 Subject: [PATCH 237/271] s3 notebook - code as documentation --- notebooks/s3.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/s3.ipynb b/notebooks/s3.ipynb index 1009120..c3952cb 100644 --- a/notebooks/s3.ipynb +++ b/notebooks/s3.ipynb @@ -6,12 +6,12 @@ "source": [ "#### Writing to AWS S3\n", "\n", - "We have setup our demo environment with the label **aws** passed to reference our s3 access_key and secret_key and file. In the cell below we will write the data to our aws s3 bucket named **com.phi.demo**" + "We have setup our demo environment with the label **aws** passed to reference our s3 access_key and secret_key and file (called friends.csv). 
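For reference, a plausible shape of the auth-file behind the **aws** label (field names follow the s3 provider arguments shown earlier; the key values and region are placeholders):

    {
      "provider": "s3",
      "access_key": "<ACCESS_KEY>",
      "secret_key": "<SECRET_KEY>",
      "region": "us-east-1",
      "bucket": "com.phi.demo",
      "file": "friends.csv"
    }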
In the cell below we will write the data to our aws s3 bucket named **com.phi.demo**" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ "from transport import providers\n", "import pandas as pd\n", "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", - "mgw = transport.get.writer(label='aws',file='friends.csv',bucket='com.phi.demo')\n", + "mgw = transport.get.writer(label='aws')\n", "mgw.write(_data)\n", "print (transport.__version__)" ] @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -79,7 +79,7 @@ "def cast(stream) :\n", " print (stream)\n", " return pd.DataFrame(str(stream))\n", - "mgr = transport.get.reader(label='aws', bucket='com.phi.demo',file='friends.csv')\n", + "mgr = transport.get.reader(label='aws')\n", "_df = mgr.read()\n", "print (_df)\n", "print ('--------- STATISTICS ------------')\n", From 34db729ad48b62ab4d3888994aedc23fa561d51d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 6 Sep 2024 10:59:51 -0500 Subject: [PATCH 238/271] bug fixes: mongodb console --- info/__init__.py | 2 +- transport/nosql/mongodb.py | 2 ++ transport/other/__init__.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 04183a9..04adfdf 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.1' +__version__= '2.2.2' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. Nyemba diff --git a/transport/nosql/mongodb.py b/transport/nosql/mongodb.py index 7c5b8b2..503f821 100644 --- a/transport/nosql/mongodb.py +++ b/transport/nosql/mongodb.py @@ -33,6 +33,8 @@ class Mongo : :password password for current user """ self.host = 'localhost' if 'host' not in args else args['host'] + if ':' not in self.host and 'port' in args : + self.host = ':'.join([self.host,str(args['port'])]) self.mechanism= 'SCRAM-SHA-256' if 'mechanism' not in args else args['mechanism'] # authSource=(args['authSource'] if 'authSource' in args else self.dbname) self._lock = False if 'lock' not in args else args['lock'] diff --git a/transport/other/__init__.py b/transport/other/__init__.py index ea26d80..77d8e2f 100644 --- a/transport/other/__init__.py +++ b/transport/other/__init__.py @@ -1 +1 @@ -from . import files, http, rabbitmq, callback, files \ No newline at end of file +from . import files, http, rabbitmq, callback, files From e9aab3b034c214d2a7d5c8187cc722a110868cd4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 13 Sep 2024 10:15:47 -0500 Subject: [PATCH 239/271] bug fix, duckdb in-memory handling --- info/__init__.py | 2 +- notebooks/plugins.ipynb | 149 ++++++++++++++++++++++++++++++++++++++++ transport/duck.py | 19 +++++ 3 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 notebooks/plugins.ipynb create mode 100644 transport/duck.py diff --git a/info/__init__.py b/info/__init__.py index 04adfdf..a97b1ab 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.2' +__version__= '2.2.4' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. 
Nyemba diff --git a/notebooks/plugins.ipynb b/notebooks/plugins.ipynb new file mode 100644 index 0000000..a5f7abb --- /dev/null +++ b/notebooks/plugins.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing data-transport plugins\n", + "\n", + "The data-transport plugins are designed to automate pre/post processing i.e\n", + "\n", + " - Read -> Post processing\n", + " - Write-> Pre processing\n", + " \n", + "In this example we will assume, data and write both pre/post processing to any supported infrastructure. We will equally show how to specify the plugins within a configuration file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "import shutil\n", + "#\n", + "#\n", + "\n", + "DATABASE = '/home/steve/tmp/demo.db3'\n", + "if os.path.exists(DATABASE) :\n", + " os.remove(DATABASE)\n", + "#\n", + "# \n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "litew = transport.get.writer(provider=providers.SQLITE,database=DATABASE)\n", + "litew.write(_data,table='friends')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from SQLite\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age from a plugin function we will write. \n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Read with pipeline functions defined in code\n", + "\n", + "**NOTE**\n", + "\n", + "It is possible to use **transport.factory.instance** or **transport.instance** or **transport.get.<[reader|writer]>** they are the same. It allows the maintainers to know that we used a factory design pattern." 
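Plugins can also be written as decorated functions and registered under an alias; a sketch assuming a hypothetical file myplugs.py (the alias, file and function names are made up for illustration):

    # myplugs.py -- hypothetical plugin module
    import numpy as np
    from transport.plugins import Plugin

    @Plugin(name='autoinc', version='0.1', doc='adds an auto-increment column')
    def autoinc(_data):
        _data['autoinc'] = np.arange(_data.shape[0])
        return _data

Once the file is registered (e.g. transport plugins add myplugs ./myplugs.py), the function can be referenced as myplugs@autoinc.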
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "\n", + "\n", + " name age autoinc\n", + "0 James Bond 5.5 0\n", + "1 Steve Rogers 15.0 1\n", + "2 Steve Nyemba 4.4 2\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "import numpy as np\n", + "def _autoincrement (_data,**kwargs) :\n", + " \"\"\"\n", + " This function will add an autoincrement field to the table\n", + " \"\"\"\n", + " _data['autoinc'] = np.arange(_data.shape[0])\n", + " \n", + " return _data\n", + "def reduce(_data,**_args) :\n", + " \"\"\"\n", + " This function will reduce the age of the data frame\n", + " \"\"\"\n", + " _data.age /= 10\n", + " return _data\n", + "reader = transport.get.reader(provider=providers.SQLITE,database=DATABASE,table='friends')\n", + "#\n", + "# basic read of the data created in the first cell\n", + "_df = reader.read()\n", + "print (_df)\n", + "print ()\n", + "print()\n", + "#\n", + "# read of the data with pipeline function provided to alter the database\n", + "print (reader.read(pipeline=[_autoincrement,reduce]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The parameters for instianciating a transport object (reader or writer) can be found at [data-transport home](https://healthcareio.the-phi.com/data-transport)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/transport/duck.py b/transport/duck.py new file mode 100644 index 0000000..7d580c9 --- /dev/null +++ b/transport/duck.py @@ -0,0 +1,19 @@ +""" +This file will be intended to handle duckdb database +""" + +import duckdb +from transport.common import Reader,Writer + +class Duck(Reader): + def __init__(self,**_args): + super().__init__(**_args) + self._path = None if 'path' not in _args else _args['path'] + self._handler = duckdb.connect() if not self._path else duckdb.connect(self._path) + + +class DuckReader(Duck) : + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args) : + pass \ No newline at end of file From 2df926da12228fbe346a4071184ac40ad22926a0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 19 Sep 2024 11:15:13 -0500 Subject: [PATCH 240/271] new provider console and bug fixes with applied commands --- info/__init__.py | 2 +- transport/other/__init__.py | 2 +- transport/sql/common.py | 12 +++++++++++- transport/sql/duckdb.py | 5 ++++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index a97b1ab..3eded86 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.4' +__version__= '2.2.6' __email__ = "info@the-phi.com" __license__=f""" Copyright 2010 - 2024, Steve L. 
Nyemba diff --git a/transport/other/__init__.py b/transport/other/__init__.py index 77d8e2f..878b06a 100644 --- a/transport/other/__init__.py +++ b/transport/other/__init__.py @@ -1 +1 @@ -from . import files, http, rabbitmq, callback, files +from . import files, http, rabbitmq, callback, files, console diff --git a/transport/sql/common.py b/transport/sql/common.py index 4c9d4a7..0a55ed7 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -3,6 +3,8 @@ This file encapsulates common operations associated with SQL databases via SQLAl """ import sqlalchemy as sqa +from sqlalchemy import text + import pandas as pd class Base: @@ -56,7 +58,15 @@ class Base: @TODO: Execution of stored procedures """ - return pd.read_sql(sql,self._engine) if sql.lower().startswith('select') or sql.lower().startswith('with') else None + if sql.lower().startswith('select') or sql.lower().startswith('with') : + + return pd.read_sql(sql,self._engine) + else: + _handler = self._engine.connect() + _handler.execute(text(sql)) + _handler.commit () + _handler.close() + return None class SQLBase(Base): def __init__(self,**_args): diff --git a/transport/sql/duckdb.py b/transport/sql/duckdb.py index ab82bb2..06f66e5 100644 --- a/transport/sql/duckdb.py +++ b/transport/sql/duckdb.py @@ -5,7 +5,10 @@ from transport.sql.common import Base, BaseReader, BaseWriter class Duck : def __init__(self,**_args): - self.database = _args['database'] + # + # duckdb with none as database will operate as an in-memory database + # + self.database = _args['database'] if 'database' in _args else '' def get_provider(self): return "duckdb" From d0e655e7e3ddbb4e40192e21688a7cf2fd90ef00 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 29 Oct 2024 09:48:59 -0500 Subject: [PATCH 241/271] update, community edition baseline --- transport/registry.py | 172 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 7 deletions(-) diff --git a/transport/registry.py b/transport/registry.py index 6764f1b..f3dc8ac 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -3,6 +3,10 @@ import json from info import __version__ import copy import transport +import importlib +import importlib.util +import shutil + """ This class manages data from the registry and allows (read only) @@ -16,28 +20,182 @@ REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ : REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH'] REGISTRY_FILE= 'transport-registry.json' - DATA = {} +class plugins: + # + # This is a utility function that should enable management of plugins-registry + # The class allows to add/remove elements + # + # @TODO: add read/write properties to the class (better design practice) + # + _data = {} + FOLDER = os.sep.join([REGISTRY_PATH,'plugins']) + CODE = os.sep.join([REGISTRY_PATH,'plugins','code']) + FILE = os.sep.join([REGISTRY_PATH,'plugin-registry.json']) + @staticmethod + def init(): + + if not os.path.exists(plugins.FOLDER) : + os.makedirs(plugins.FOLDER) + if not os.path.exists(plugins.CODE): + os.makedirs(plugins.CODE) + if not os.path.exists(plugins.FILE): + f = open(plugins.FILE,'w') + f.write("{}") + f.close() + plugins._read() #-- will load data as a side effect + + @staticmethod + def copy (path) : + + shutil.copy2(path,plugins.CODE) + @staticmethod + def _read (): + f = open(plugins.FILE) + try: + _data = json.loads(f.read()) + f.close() + except Exception as e: + print (f"Corrupted registry, resetting ...") + _data = {} + 
plugins._write(_data) + + plugins._data = _data + @staticmethod + def _write (_data): + f = open(plugins.FILE,'w') + f.write(json.dumps(_data)) + f.close() + plugins._data = _data + + @staticmethod + def inspect (_path): + _names = [] + + if os.path.exists(_path) : + _filename = _path.split(os.sep)[-1] + spec = importlib.util.spec_from_file_location(_filename, _path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # _names = [{'name':getattr(getattr(module,_name),'name'),'pointer':getattr(module,_name)} for _name in dir(module) if type( getattr(module,_name)).__name__ == 'function'] + for _name in dir(module) : + _pointer = getattr(module,_name) + if hasattr(_pointer,'transport') : + _item = {'real_name':_name,'name':getattr(_pointer,'name'),'pointer':_pointer,'version':getattr(_pointer,'version')} + _names.append(_item) + + + return _names + @staticmethod + def add (alias,path): + """ + Add overwrite the registry entries + """ + _names = plugins.inspect (path) + _log = [] + + if _names : + # + # We should make sure we have all the plugins with the attributes (transport,name) set + _names = [_item for _item in _names if hasattr(_item['pointer'],'transport') ] + if _names : + plugins.copy(path) + _content = [] + + for _item in _names : + _key = '@'.join([alias,_item['name']]) + _log.append(_item['name']) + # + # Let us update the registry + # + plugins.update(alias,path,_log) + return _log + + @staticmethod + def update (alias,path,_log) : + """ + updating the registry entries of the plugins (management data) + """ + # f = open(plugins.FILE) + # _data = json.loads(f.read()) + # f.close() + _data = plugins._data + # _log = plugins.add(alias,path) + + if _log : + _data[alias] = {'content':_log,'name':path.split(os.sep)[-1]} + plugins._write(_data) #-- will update data as a side effect + + return _log + @staticmethod + def get(**_args) : + # f = open(plugins.FILE) + # _data = json.loads(f.read()) + # f.close() + # if 'key' in _args : + # alias,name = _args['key'].split('.') if '.' in _args['key'] else _args['key'].split('@') + # else : + # alias = _args['alias'] + # name = _args['name'] + + # if alias in _data : + + # _path = os.sep.join([plugins.CODE,_data[alias]['name']]) + # _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] + + # _item = _item[0] if _item else None + # if _item : + + # return _item['pointer'] + # return None + _item = plugins.has(**_args) + return _item['pointer'] if _item else None + + @staticmethod + def has (**_args): + f = open(plugins.FILE) + _data = json.loads(f.read()) + f.close() + if 'key' in _args : + alias,name = _args['key'].split('.') if '.' 
in _args['key'] else _args['key'].split('@') + else : + alias = _args['alias'] + name = _args['name'] + + if alias in _data : + + _path = os.sep.join([plugins.CODE,_data[alias]['name']]) + _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] + + _item = _item[0] if _item else None + if _item : + + return copy.copy(_item) + return None + @staticmethod + def synch(): + pass def isloaded (): return DATA not in [{},None] -def exists (path=REGISTRY_PATH) : +def exists (path=REGISTRY_PATH,_file=REGISTRY_FILE) : """ This function determines if there is a registry at all """ p = os.path.exists(path) - q = os.path.exists( os.sep.join([path,REGISTRY_FILE])) + q = os.path.exists( os.sep.join([path,_file])) return p and q -def load (_path=REGISTRY_PATH): +def load (_path=REGISTRY_PATH,_file=REGISTRY_FILE): global DATA if exists(_path) : - path = os.sep.join([_path,REGISTRY_FILE]) + path = os.sep.join([_path,_file]) f = open(path) DATA = json.loads(f.read()) f.close() -def init (email,path=REGISTRY_PATH,override=False): +def init (email,path=REGISTRY_PATH,override=False,_file=REGISTRY_FILE): """ Initializing the registry and will raise an exception in the advent of an issue """ @@ -47,7 +205,7 @@ def init (email,path=REGISTRY_PATH,override=False): _config = {"email":email,'version':__version__} if not os.path.exists(path): os.makedirs(path) - filename = os.sep.join([path,REGISTRY_FILE]) + filename = os.sep.join([path,_file]) if not os.path.exists(filename) or override == True : f = open(filename,'w') From 2a72de4cd6a9acc40f66ac16557c4eac9094d048 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 31 Dec 2024 12:20:22 -0600 Subject: [PATCH 242/271] bug fixes: registry and handling cli parameters as well as adding warehousing --- bin/transport | 114 ++++++++++++++++++------ setup.py | 2 +- transport/__init__.py | 91 +++++++++++++------ transport/iowrapper.py | 20 +++-- transport/plugins/__init__.py | 109 ++++++++++++++--------- transport/providers/__init__.py | 8 +- transport/registry.py | 2 + transport/warehouse/__init__.py | 7 ++ transport/warehouse/drill.py | 55 ++++++++++++ transport/warehouse/iceberg.py | 151 ++++++++++++++++++++++++++++++++ 10 files changed, 458 insertions(+), 101 deletions(-) create mode 100644 transport/warehouse/__init__.py create mode 100644 transport/warehouse/drill.py create mode 100644 transport/warehouse/iceberg.py diff --git a/bin/transport b/bin/transport index 4053c4e..d2072f7 100755 --- a/bin/transport +++ b/bin/transport @@ -24,19 +24,25 @@ from multiprocessing import Process import os import transport -from transport import etl +# from transport import etl +from transport.iowrapper import IETL # from transport import providers import typer from typing_extensions import Annotated from typing import Optional import time from termcolor import colored +from enum import Enum +from rich import print app = typer.Typer() +app_x = typer.Typer() +app_i = typer.Typer() +app_r = typer.Typer() REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) REGISTRY_FILE= 'transport-registry.json' -CHECK_MARK = ' '.join(['[',colored(u'\u2713', 'green'),']']) -TIMES_MARK= ' '.join(['[',colored(u'\u2717','red'),']']) +CHECK_MARK = '[ [green]\u2713[/green] ]' #' '.join(['[',colored(u'\u2713', 'green'),']']) +TIMES_MARK= '[ [red]\u2717[/red] ]' #' '.join(['[',colored(u'\u2717','red'),']']) # @app.command() def help() : print (__doc__) @@ -44,10 +50,15 @@ def wait(jobs): while jobs : jobs = [thread for thread in jobs if thread.is_alive()] time.sleep(1) +def 
wait (jobs): + while jobs : + jobs = [pthread for pthread in jobs if pthread.is_alive()] -@app.command(name="apply") +@app.command(name="etl") def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], - index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed")): + index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"), + batch:int = typer.Option(default=5, help="The number of parallel processes to run at once") + ): """ This function applies data transport ETL feature to read data from one source to write it one or several others """ @@ -56,23 +67,34 @@ def apply (path:Annotated[str,typer.Argument(help="path of the configuration fil file = open(path) _config = json.loads (file.read() ) file.close() - if index : + if index is not None: _config = [_config[ int(index)]] - jobs = [] + jobs = [] for _args in _config : - pthread = etl.instance(**_args) #-- automatically starts the process + # pthread = etl.instance(**_args) #-- automatically starts the process + def bootup (): + _worker = IETL(**_args) + _worker.run() + pthread = Process(target=bootup) + pthread.start() jobs.append(pthread) + if len(jobs) == batch : + wait(jobs) + jobs = [] + + if jobs : + wait (jobs) # - # @TODO: Log the number of processes started and estimated time - while jobs : - jobs = [pthread for pthread in jobs if pthread.is_alive()] - time.sleep(1) + # @TODO: Log the number of processes started and estfrom transport impfrom transport impimated time + # while jobs : + # jobs = [pthread for pthread in jobs if pthread.is_alive()] + # time.sleep(1) # # @TODO: Log the job termination here ... -@app.command(name="providers") +@app_i.command(name="supported") def supported (format:Annotated[str,typer.Argument(help="format of the output, supported formats are (list,table,json)")]="table") : """ - This function will print supported providers/vendors and their associated classifications + This function will print supported database technologies """ _df = (transport.supported()) if format in ['list','json'] : @@ -81,13 +103,14 @@ def supported (format:Annotated[str,typer.Argument(help="format of the output, s print (_df) print () -@app.command() -def version(): +@app_i.command(name="license") +def info(): """ This function will display version and license information """ - print (transport.__app_name__,'version ',transport.__version__) + print (f'[bold] {transport.__app_name__} ,version {transport.__version__}[/bold]') + print () print (transport.__license__) @app.command() @@ -99,18 +122,18 @@ def generate (path:Annotated[str,typer.Argument(help="path of the ETL configurat { "source":{"provider":"http","url":"https://raw.githubusercontent.com/codeforamerica/ohana-api/master/data/sample-csv/addresses.csv"}, "target": - [{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite","database":"sample.db3","table":"addresses"}] + [{"provider":"files","path":"addresses.csv","delimiter":","},{"provider":"sqlite3","database":"sample.db3","table":"addresses"}] } ] file = open(path,'w') file.write(json.dumps(_config)) file.close() - print (f"""{CHECK_MARK} Successfully generated a template ETL file at {path}""" ) + print (f"""{CHECK_MARK} Successfully generated a template ETL file at [bold]{path}[/bold]""" ) print ("""NOTE: Each line (source or target) is the content of an auth-file""") -@app.command(name="init") +@app_r.command(name="reset") def 
initregistry (email:Annotated[str,typer.Argument(help="email")], path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"), override:bool=typer.Option(default=False,help="override existing configuration or not")): @@ -120,24 +143,24 @@ def initregistry (email:Annotated[str,typer.Argument(help="email")], """ try: transport.registry.init(email=email, path=path, override=override) - _msg = f"""{CHECK_MARK} Successfully wrote configuration to {path} from {email}""" + _msg = f"""{CHECK_MARK} Successfully wrote configuration to [bold]{path}[/bold] from [bold]{email}[/bold]""" except Exception as e: _msg = f"{TIMES_MARK} {e}" print (_msg) print () -@app.command(name="register") +@app_r.command(name="add") def register (label:Annotated[str,typer.Argument(help="unique label that will be used to load the parameters of the database")], auth_file:Annotated[str,typer.Argument(help="path of the auth_file")], default:bool=typer.Option(default=False,help="set the auth_file as default"), path:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")): """ - This function will register an auth-file i.e database connection and assign it a label, - Learn more about auth-file at https://healthcareio.the-phi.com/data-transport + This function add a database label for a given auth-file. which allows access to the database using a label of your choice. + """ try: if transport.registry.exists(path) : transport.registry.set(label=label,auth_file=auth_file, default=default, path=path) - _msg = f"""{CHECK_MARK} Successfully added label "{label}" to data-transport registry""" + _msg = f"""{CHECK_MARK} Successfully added label [bold]"{label}"[/bold] to data-transport registry""" else: _msg = f"""{TIMES_MARK} Registry is not initialized, please initialize the registry (check help)""" except Exception as e: @@ -145,6 +168,47 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be print (_msg) pass +@app_x.command(name='add') +def register_plugs ( + alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")], + path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")] + ): + """ + This function will register a file and the functions within will be refrences . 
in a configuration file + """ + transport.registry.plugins.init() + _log = transport.registry.plugins.add(alias,path) + _mark = TIMES_MARK if not _log else CHECK_MARK + _msg = f"""Could NOT add the [bold]{alias}[/bold]to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added""" + print (f"""{_mark} {_msg}""") +@app_x.command(name="list") +def registry_list (): + + transport.registry.plugins.init() + _d = [] + for _alias in transport.registry.plugins._data : + _data = transport.registry.plugins._data[_alias] + _d += [{'alias':_alias,"plugin-count":len(_data['content']),'e.g':'@'.join([_alias,_data['content'][0]]),'plugins':json.dumps(_data['content'])}] + if _d: + print (pd.DataFrame(_d)) + else: + print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""") + +@app_x.command(name="test") +def registry_test (key): + """ + This function allows to test syntax for a plugin i.e in terms of alias@function + """ + _item = transport.registry.plugins.has(key=key) + if _item : + del _item['pointer'] + print (f"""{CHECK_MARK} successfully loaded \033[1m{key}\033[0m found, version {_item['version']}""") + print (pd.DataFrame([_item])) + else: + print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered") +app.add_typer(app_r,name='registry',help='This function allows labeling database access information') +app.add_typer(app_i,name="info",help="This function will print either license or supported database technologies") +app.add_typer(app_x, name="plugins",help="This function enables add/list/test of plugins in the registry") if __name__ == '__main__' : app() diff --git a/setup.py b/setup.py index 7bb44e8..f11a6ca 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql'] +args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] # if sys.version_info[0] == 2 : diff --git a/transport/__init__.py b/transport/__init__.py index b934760..33a3261 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,7 +18,7 @@ Source Code is available under MIT License: """ import numpy as np -from transport import sql, nosql, cloud, other +from transport import sql, nosql, cloud, other, warehouse import pandas as pd import json import os @@ -28,21 +28,26 @@ from transport.plugins import PluginLoader from transport import providers import copy from transport import registry - +from transport.plugins import Plugin PROVIDERS = {} def init(): global PROVIDERS - for _module in [cloud,sql,nosql,other] : + for _module in [cloud,sql,nosql,other,warehouse] : for _provider_name in dir(_module) : if 
_provider_name.startswith('__') or _provider_name == 'common': continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} -def _getauthfile (path) : - f = open(path) - _object = json.loads(f.read()) - f.close() - return _object + # + # loading the registry + if not registry.isloaded() : + registry.load() + +# def _getauthfile (path) : +# f = open(path) +# _object = json.loads(f.read()) +# f.close() +# return _object def instance (**_args): """ This function returns an object of to read or write from a supported database provider/vendor @@ -52,16 +57,7 @@ def instance (**_args): kwargs These are arguments that are provider/vendor specific """ global PROVIDERS - # if not registry.isloaded () : - # if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): - # registry.load() if 'path' not in _args else registry.load(_args['path']) - # print ([' GOT IT']) - # if 'label' in _args and registry.isloaded(): - # _info = registry.get(_args['label']) - # if _info : - # # - # _args = dict(_args,**_info) - + if 'auth_file' in _args: if os.path.exists(_args['auth_file']) : # @@ -78,7 +74,7 @@ def instance (**_args): filename = _args['auth_file'] raise Exception(f" {filename} was not found or is invalid") if 'provider' not in _args and 'auth_file' not in _args : - if not registry.isloaded () : + if not registry.isloaded () : if ('path' in _args and registry.exists(_args['path'] )) or registry.exists(): registry.load() if 'path' not in _args else registry.load(_args['path']) _info = {} @@ -87,8 +83,6 @@ def instance (**_args): else: _info = registry.get() if _info : - # - # _args = dict(_args,**_info) _args = dict(_info,**_args) #-- we can override the registry parameters with our own arguments if 'provider' in _args and _args['provider'] in PROVIDERS : @@ -119,8 +113,32 @@ def instance (**_args): # for _delegate in _params : # loader.set(_delegate) - loader = None if 'plugins' not in _args else _args['plugins'] - return IReader(_agent,loader) if _context == 'read' else IWriter(_agent,loader) + _plugins = None if 'plugins' not in _args else _args['plugins'] + + # if registry.has('logger') : + # _kwa = registry.get('logger') + # _lmodule = getPROVIDERS[_kwa['provider']] + + if ( ('label' in _args and _args['label'] != 'logger') and registry.has('logger')): + # + # We did not request label called logger, so we are setting up a logger if it is specified in the registry + # + _kwargs = registry.get('logger') + _kwargs['context'] = 'write' + _kwargs['table'] =_module.__name__.split('.')[-1]+'_logs' + # _logger = instance(**_kwargs) + _module = PROVIDERS[_kwargs['provider']]['module'] + _logger = getattr(_module,'Writer') + _logger = _logger(**_kwargs) + else: + _logger = None + + _kwargs = {'agent':_agent,'plugins':_plugins,'logger':_logger} + if 'args' in _args : + _kwargs['args'] = _args['args'] + # _datatransport = IReader(_agent,_plugins,_logger) if _context == 'read' else IWriter(_agent,_plugins,_logger) + _datatransport = IReader(**_kwargs) if _context == 'read' else IWriter(**_kwargs) + return _datatransport else: # @@ -137,7 +155,14 @@ class get : if not _args or ('provider' not in _args and 'label' not in _args): _args['label'] = 'default' _args['context'] = 'read' - return instance(**_args) + # return instance(**_args) + # _args['logger'] = instance(**{'label':'logger','context':'write','table':'logs'}) + + _handler = instance(**_args) + # _handler.setLogger(get.logger()) + return _handler + + @staticmethod def writer(**_args): """ @@ 
-146,10 +171,26 @@ class get : if not _args or ('provider' not in _args and 'label' not in _args): _args['label'] = 'default' _args['context'] = 'write' - return instance(**_args) + # _args['logger'] = instance(**{'label':'logger','context':'write','table':'logs'}) + + _handler = instance(**_args) + # + # Implementing logging with the 'eat-your-own-dog-food' approach + # Using dependency injection to set the logger (problem with imports) + # + # _handler.setLogger(get.logger()) + return _handler + @staticmethod + def logger (): + if registry.has('logger') : + _args = registry.get('logger') + _args['context'] = 'write' + return instance(**_args) + return None @staticmethod def etl (**_args): if 'source' in _args and 'target' in _args : + return IETL(**_args) else: raise Exception ("Malformed input found, object must have both 'source' and 'target' attributes") diff --git a/transport/iowrapper.py b/transport/iowrapper.py index e3abf6c..e532e7d 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -5,7 +5,7 @@ NOTE: Plugins are converted to a pipeline, so we apply a pipeline when reading o - upon initialization we will load plugins - on read/write we apply a pipeline (if passed as an argument) """ -from transport.plugins import plugin, PluginLoader +from transport.plugins import Plugin, PluginLoader import transport from transport import providers from multiprocessing import Process @@ -16,7 +16,10 @@ class IO: """ Base wrapper class for read/write and support for logs """ - def __init__(self,_agent,plugins): + def __init__(self,**_args): + _agent = _args['agent'] + plugins = _args['plugins'] if 'plugins' not in _args else None + self._agent = _agent if plugins : self._init_plugins(plugins) @@ -63,8 +66,9 @@ class IReader(IO): """ This is a wrapper for read functionalities """ - def __init__(self,_agent,pipeline=None): - super().__init__(_agent,pipeline) + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): if 'plugins' in _args : self._init_plugins(_args['plugins']) @@ -75,8 +79,8 @@ class IReader(IO): # output data return _data class IWriter(IO): - def __init__(self,_agent,pipeline=None): - super().__init__(_agent,pipeline) + def __init__(self,**_args): #_agent,pipeline=None): + super().__init__(**_args) #_agent,pipeline) def write(self,_data,**_args): if 'plugins' in _args : self._init_plugins(_args['plugins']) @@ -94,7 +98,7 @@ class IETL(IReader) : This class performs an ETL operation by ineriting a read and adding writes as pipeline functions """ def __init__(self,**_args): - super().__init__(transport.get.reader(**_args['source'])) + super().__init__(agent=transport.get.reader(**_args['source']),plugins=None) if 'target' in _args: self._targets = _args['target'] if type(_args['target']) == list else [_args['target']] else: @@ -110,6 +114,8 @@ class IETL(IReader) : self.post(_data,**_kwargs) return _data + def run(self) : + return self.read() def post (self,_data,**_args) : """ This function returns an instance of a process that will perform the write operation diff --git a/transport/plugins/__init__.py b/transport/plugins/__init__.py index 26e5782..760b66c 100644 --- a/transport/plugins/__init__.py +++ b/transport/plugins/__init__.py @@ -11,8 +11,10 @@ import importlib as IL import importlib.util import sys import os +import pandas as pd +import time -class plugin : +class Plugin : """ Implementing function decorator for data-transport plugins (post-pre)-processing """ @@ -22,8 +24,9 @@ class plugin : :mode restrict to reader/writer :about 
tell what the function is about """ - self._name = _args['name'] - self._about = _args['about'] + self._name = _args['name'] if 'name' in _args else None + self._version = _args['version'] if 'version' in _args else '0.1' + self._doc = _args['doc'] if 'doc' in _args else "N/A" self._mode = _args['mode'] if 'mode' in _args else 'rw' def __call__(self,pointer,**kwargs): def wrapper(_args,**kwargs): @@ -32,57 +35,64 @@ class plugin : # @TODO: # add attributes to the wrapper object # + self._name = pointer.__name__ if not self._name else self._name setattr(wrapper,'transport',True) setattr(wrapper,'name',self._name) - setattr(wrapper,'mode',self._mode) - setattr(wrapper,'about',self._about) + setattr(wrapper,'version',self._version) + setattr(wrapper,'doc',self._doc) return wrapper - class PluginLoader : """ This class is intended to load a plugin and make it available and assess the quality of the developed plugin """ + def __init__(self,**_args): """ - :path location of the plugin (should be a single file) - :_names of functions to load """ - _names = _args['names'] if 'names' in _args else None - path = _args['path'] if 'path' in _args else None - self._names = _names if type(_names) == list else [_names] + # _names = _args['names'] if 'names' in _args else None + # path = _args['path'] if 'path' in _args else None + # self._names = _names if type(_names) == list else [_names] self._modules = {} self._names = [] - if path and os.path.exists(path) and _names: - for _name in self._names : - - spec = importlib.util.spec_from_file_location('private', path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) #--loads it into sys.modules - if hasattr(module,_name) : - if self.isplugin(module,_name) : - self._modules[_name] = getattr(module,_name) - else: - print ([f'Found {_name}', 'not plugin']) - else: - # - # @TODO: We should log this somewhere some how - print (['skipping ',_name, hasattr(module,_name)]) - pass - else: - # - # Initialization is empty - self._names = [] + self._registry = _args['registry'] + pass - def set(self,_pointer) : + def load (self,**_args): + self._modules = {} + self._names = [] + path = _args ['path'] + if os.path.exists(path) : + _alias = path.split(os.sep)[-1] + spec = importlib.util.spec_from_file_location(_alias, path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) #--loads it into sys.modules + for _name in dir(module) : + if self.isplugin(module,_name) : + self._module[_name] = getattr(module,_name) + # self._names [_name] + def format (self,**_args): + uri = _args['alias'],_args['name'] + # def set(self,_pointer) : + def set(self,_key) : """ This function will set a pointer to the list of modules to be called This should be used within the context of using the framework as a library """ - _name = _pointer.__name__ + if type(_key).__name__ == 'function': + # + # The pointer is in the code provided by the user and loaded in memory + # + _pointer = _key + _key = 'inline@'+_key.__name__ + # self._names.append(_key.__name__) + else: + _pointer = self._registry.get(key=_key) + + if _pointer : + self._modules[_key] = _pointer + self._names.append(_key) - self._modules[_name] = _pointer - self._names.append(_name) def isplugin(self,module,name): """ This function determines if a module is a recognized plugin @@ -107,12 +117,31 @@ class PluginLoader : _n = len(self._names) return len(set(self._modules.keys()) & set (self._names)) / _n - def apply(self,_data): + def apply(self,_data,_logger=[]): + 
_input= {} + for _name in self._modules : - _pointer = self._modules[_name] - # - # @TODO: add exception handling - _data = _pointer(_data) + try: + _input = {'action':'plugin','object':_name,'input':{'status':'PASS'}} + _pointer = self._modules[_name] + if type(_data) == list : + _data = pd.DataFrame(_data) + _brow,_bcol = list(_data.shape) + + # + # @TODO: add exception handling + _data = _pointer(_data) + + _input['input']['shape'] = {'rows-dropped':_brow - _data.shape[0]} + except Exception as e: + _input['input']['status'] = 'FAILED' + print (e) + time.sleep(1) + if _logger: + try: + _logger(**_input) + except Exception as e: + pass return _data # def apply(self,_data,_name): # """ diff --git a/transport/providers/__init__.py b/transport/providers/__init__.py index 6422d74..b4cf37a 100644 --- a/transport/providers/__init__.py +++ b/transport/providers/__init__.py @@ -11,7 +11,7 @@ BIGQUERY ='bigquery' FILE = 'file' ETL = 'etl' -SQLITE = 'sqlite' +SQLITE = 'sqlite3' SQLITE3= 'sqlite3' DUCKDB = 'duckdb' @@ -44,7 +44,9 @@ PGSQL = POSTGRESQL AWS_S3 = 's3' RABBIT = RABBITMQ - - +ICEBERG='iceberg' +APACHE_ICEBERG = 'iceberg' +DRILL = 'drill' +APACHE_DRILL = 'drill' # QLISTENER = 'qlistener' \ No newline at end of file diff --git a/transport/registry.py b/transport/registry.py index f3dc8ac..1f612dc 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -220,6 +220,8 @@ def init (email,path=REGISTRY_PATH,override=False,_file=REGISTRY_FILE): def lookup (label): global DATA return label in DATA +has = lookup + def get (label='default') : global DATA return copy.copy(DATA[label]) if label in DATA else {} diff --git a/transport/warehouse/__init__.py b/transport/warehouse/__init__.py new file mode 100644 index 0000000..bcd76fd --- /dev/null +++ b/transport/warehouse/__init__.py @@ -0,0 +1,7 @@ +""" +This namespace/package is intended to handle read/writes against data warehouse solutions like : + - apache iceberg + - clickhouse (...) +""" + +from . import iceberg, drill \ No newline at end of file diff --git a/transport/warehouse/drill.py b/transport/warehouse/drill.py new file mode 100644 index 0000000..71f0e64 --- /dev/null +++ b/transport/warehouse/drill.py @@ -0,0 +1,55 @@ +import sqlalchemy +import pandas as pd +from .. sql.common import BaseReader , BaseWriter +import sqlalchemy as sqa + +class Drill : + __template = {'host':None,'port':None,'ssl':None,'table':None,'database':None} + def __init__(self,**_args): + + self._host = _args['host'] if 'host' in _args else 'localhost' + self._port = _args['port'] if 'port' in _args else self.get_default_port() + self._ssl = False if 'ssl' not in _args else _args['ssl'] + + self._table = _args['table'] if 'table' in _args else None + if self._table and '.' in self._table : + _seg = self._table.split('.') + if len(_seg) > 2 : + self._schema,self._database = _seg[:2] + else: + + self._database=_args['database'] + self._schema = self._database.split('.')[0] + + def _get_uri(self,**_args): + return f'drill+sadrill://{self._host}:{self._port}/{self._database}?use_ssl={self._ssl}' + def get_provider(self): + return "drill+sadrill" + def get_default_port(self): + return "8047" + def meta(self,**_args): + _table = _args['table'] if 'table' in _args else self._table + if '.' 
in _table : + _schema = _table.split('.')[:2] + _schema = '.'.join(_schema) + _table = _table.split('.')[-1] + else: + _schema = self._schema + + # _sql = f"select COLUMN_NAME AS name, CASE WHEN DATA_TYPE ='CHARACTER VARYING' THEN 'CHAR ( 125 )' ELSE DATA_TYPE END AS type from INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='{_schema}' and TABLE_NAME='{_table}'" + _sql = f"select COLUMN_NAME AS name, CASE WHEN DATA_TYPE ='CHARACTER VARYING' THEN 'CHAR ( '||COLUMN_SIZE||' )' ELSE DATA_TYPE END AS type from INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA='{_schema}' and TABLE_NAME='{_table}'" + try: + _df = pd.read_sql(_sql,self._engine) + return _df.to_dict(orient='records') + except Exception as e: + print (e) + pass + return [] +class Reader (Drill,BaseReader) : + def __init__(self,**_args): + super().__init__(**_args) + self._chunksize = 0 if 'chunksize' not in _args else _args['chunksize'] + self._engine= sqa.create_engine(self._get_uri(),future=True) +class Writer(Drill,BaseWriter): + def __init__(self,**_args): + super().__init__(self,**_args) \ No newline at end of file diff --git a/transport/warehouse/iceberg.py b/transport/warehouse/iceberg.py new file mode 100644 index 0000000..4e73c62 --- /dev/null +++ b/transport/warehouse/iceberg.py @@ -0,0 +1,151 @@ +""" +dependency: + - spark and SPARK_HOME environment variable must be set +NOTE: + When using streaming option, insure that it is inline with default (1000 rows) or increase it in spark-defaults.conf + +""" +from pyspark.sql import SparkSession +from pyspark import SparkContext +from pyspark.sql.types import * +from pyspark.sql.functions import col, to_date, to_timestamp +import copy + +class Iceberg : + def __init__(self,**_args): + """ + providing catalog meta information (you must get this from apache iceberg) + """ + # + # Turning off logging (it's annoying & un-professional) + # + # _spconf = SparkContext() + # _spconf.setLogLevel("ERROR") + # + # @TODO: + # Make arrangements for additional configuration elements + # + self._session = SparkSession.builder.appName("data-transport").getOrCreate() + self._session.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS") + # self._session.sparkContext.setLogLevel("ERROR") + self._catalog = self._session.catalog + self._table = _args['table'] if 'table' in _args else None + + if 'catalog' in _args : + # + # Let us set the default catalog + self._catalog.setCurrentCatalog(_args['catalog']) + + else: + # No current catalog has been set ... + pass + if 'database' in _args : + self._database = _args['database'] + self._catalog.setCurrentDatabase(self._database) + else: + # + # Should we set the default as the first one if available ? 
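A rough usage sketch for the Drill classes above; the host, port, storage plugin and table names are placeholders, and it assumes the new drill provider constant added in this same commit is wired to this Reader in the factory. Under the hood the connection goes through the sqlalchemy_drill dialect (drill+sadrill), which is also added to the package dependencies.

    import transport
    from transport import providers

    drill_reader = transport.get.reader(provider=providers.DRILL, host='localhost', port='8047',
                                        database='dfs.tmp', table='friends')
    _df     = drill_reader.read()     # SELECT * FROM friends
    _schema = drill_reader.meta()     # [{'name': ..., 'type': ...}, ...] from INFORMATION_SCHEMA
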
+ # + pass + self._catalogName = self._catalog.currentCatalog() + self._databaseName = self._catalog.currentDatabase() + def meta (self,**_args) : + """ + This function should return the schema of a table (only) + """ + _schema = [] + try: + _table = _args['table'] if 'table' in _args else self._table + _tableName = self._getPrefix(**_args) + f".{_table}" + _tmp = self._session.table(_tableName).schema + _schema = _tmp.jsonValue()['fields'] + for _item in _schema : + del _item['nullable'],_item['metadata'] + except Exception as e: + + pass + return _schema + def _getPrefix (self,**_args): + _catName = self._catalogName if 'catalog' not in _args else _args['catalog'] + _datName = self._databaseName if 'database' not in _args else _args['database'] + + return '.'.join([_catName,_datName]) + def apply(self,_query): + """ + sql query/command to run against apache iceberg + """ + return self._session.sql(_query) + def has (self,**_args): + try: + _prefix = self._getPrefix(**_args) + if _prefix.endswith('.') : + return False + return _args['table'] in [_item.name for _item in self._catalog.listTables(_prefix)] + except Exception as e: + print (e) + return False + + def close(self): + self._session.stop() +class Reader(Iceberg) : + def __init__(self,**_args): + super().__init__(**_args) + def read(self,**_args): + _table = self._table + _prefix = self._getPrefix(**_args) + if 'table' in _args or _table: + _table = _args['table'] if 'table' in _args else _table + _table = _prefix + f'.{_table}' + return self._session.table(_table).toPandas() + else: + sql = _args['sql'] + return self._session.sql(sql).toPandas() + pass +class Writer (Iceberg): + """ + Writing data to an Apache Iceberg data warehouse (using pyspark) + """ + def __init__(self,**_args): + super().__init__(**_args) + self._mode = 'append' if 'mode' not in _args else _args['mode'] + self._table = None if 'table' not in _args else _args['table'] + def format (self,_schema) : + _iceSchema = StructType([]) + _map = {'integer':IntegerType(),'float':DoubleType(),'double':DoubleType(),'date':DateType(), + 'timestamp':TimestampType(),'datetime':TimestampType(),'string':StringType(),'varchar':StringType()} + for _item in _schema : + _name = _item['name'] + _type = _item['type'].lower() + if _type not in _map : + _iceType = StringType() + else: + _iceType = _map[_type] + + _iceSchema.add (StructField(_name,_iceType,True)) + return _iceSchema if len(_iceSchema) else [] + def write(self,_data,**_args): + _prefix = self._getPrefix(**_args) + if 'table' not in _args and not self._table : + raise Exception (f"Table Name should be specified for catalog/database {_prefix}") + _schema = self.format(_args['schema']) if 'schema' in _args else [] + if not _schema : + rdd = self._session.createDataFrame(_data,verifySchema=False) + else : + rdd = self._session.createDataFrame(_data,schema=_schema,verifySchema=True) + _mode = self._mode if 'mode' not in _args else _args['mode'] + _table = self._table if 'table' not in _args else _args['table'] + + # print (_data.shape,_mode,_table) + + if not self._session.catalog.tableExists(_table): + # # @TODO: + # # add partitioning information here + rdd.writeTo(_table).using('iceberg').create() + + # # _mode = 'overwrite' + # # rdd.write.format('iceberg').mode(_mode).saveAsTable(_table) + else: + # rdd.writeTo(_table).append() + # # _table = f'{_prefix}.{_table}' + + rdd.coalesce(10).write.format('iceberg').mode('append').save(_table) From c3627586b3743acb917b852c020acde10f19f8e3 Mon Sep 17 00:00:00 2001 From: Steve 
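The Iceberg class above assumes the Spark session it creates can already see an Iceberg catalog (the notebook later in this series uses one named mz). Below is a sketch of the kind of session configuration that would make that true; the catalog name, catalog type, warehouse path and jar coordinates are all placeholders and would normally live in $SPARK_HOME/conf/spark-defaults.conf rather than in code.

    from pyspark.sql import SparkSession

    spark = (SparkSession.builder.appName("data-transport")
             .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.0")
             .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
             .config("spark.sql.catalog.mz", "org.apache.iceberg.spark.SparkCatalog")
             .config("spark.sql.catalog.mz.type", "hadoop")
             .config("spark.sql.catalog.mz.warehouse", "/tmp/iceberg-warehouse")
             .getOrCreate())
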
Nyemba Date: Tue, 31 Dec 2024 12:32:14 -0600 Subject: [PATCH 243/271] fix: refactor cli switches --- bin/transport | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/transport b/bin/transport index d2072f7..eb8b17a 100755 --- a/bin/transport +++ b/bin/transport @@ -36,9 +36,10 @@ from enum import Enum from rich import print app = typer.Typer() -app_x = typer.Typer() -app_i = typer.Typer() -app_r = typer.Typer() +app_e = typer.Typer() #-- handles etl (run, generate) +app_x = typer.Typer() #-- handles plugins (list,add, test) +app_i = typer.Typer() #-- handles information (version, license) +app_r = typer.Typer() #-- handles registry REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) REGISTRY_FILE= 'transport-registry.json' CHECK_MARK = '[ [green]\u2713[/green] ]' #' '.join(['[',colored(u'\u2713', 'green'),']']) @@ -54,7 +55,7 @@ def wait (jobs): while jobs : jobs = [pthread for pthread in jobs if pthread.is_alive()] -@app.command(name="etl") +@app_e.command(name="run") def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], index:int = typer.Option(default= None, help="index of the item of interest, otherwise everything in the file will be processed"), batch:int = typer.Option(default=5, help="The number of parallel processes to run at once") @@ -113,7 +114,7 @@ def info(): print () print (transport.__license__) -@app.command() +@app_e.command() def generate (path:Annotated[str,typer.Argument(help="path of the ETL configuration file template (name included)")]): """ This function will generate a configuration template to give a sense of how to create one @@ -206,6 +207,7 @@ def registry_test (key): print (pd.DataFrame([_item])) else: print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. 
Make sure it is registered") +app.add_typer(app_e,name='etl',help="This function will run etl or generate a template etl configuration file") app.add_typer(app_r,name='registry',help='This function allows labeling database access information') app.add_typer(app_i,name="info",help="This function will print either license or supported database technologies") app.add_typer(app_x, name="plugins",help="This function enables add/list/test of plugins in the registry") From 49ebd4a43216d883d4cfe31660a7444b1677b4d0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 2 Feb 2025 22:36:28 -0600 Subject: [PATCH 244/271] bug fix: close & etl --- transport/iowrapper.py | 9 +++++++-- transport/sql/common.py | 40 +++++++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/transport/iowrapper.py b/transport/iowrapper.py index e532e7d..cf5d717 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -109,8 +109,10 @@ class IETL(IReader) : self._hasParentProcess = False if 'hasParentProcess' not in _args else _args['hasParentProcess'] def read(self,**_args): _data = super().read(**_args) - + _schema = super().meta() for _kwargs in self._targets : + if _schema : + _kwargs['schema'] = _schema self.post(_data,**_kwargs) return _data @@ -122,5 +124,8 @@ class IETL(IReader) : :_args parameters associated with writer object """ writer = transport.get.writer(**_args) - writer.write(_data) + if 'schema' in _args : + writer.write(_data,schema=_args['schema']) + else: + writer.write(_data) writer.close() \ No newline at end of file diff --git a/transport/sql/common.py b/transport/sql/common.py index 0a55ed7..1a7e8a3 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -3,7 +3,7 @@ This file encapsulates common operations associated with SQL databases via SQLAl """ import sqlalchemy as sqa -from sqlalchemy import text +from sqlalchemy import text , MetaData, inspect import pandas as pd @@ -34,20 +34,26 @@ class Base: :table optional name of the table (can be fully qualified) """ _table = self._table if 'table' not in _args else _args['table'] + _map = {'TINYINT':'INTEGER','BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} _schema = [] - if _table : - if sqa.__version__.startswith('1.') : - _handler = sqa.MetaData(bind=self._engine) - _handler.reflect() - else: - # - # sqlalchemy's version 2.+ - _handler = sqa.MetaData() - _handler.reflect(bind=self._engine) - # - # Let us extract the schema with the native types - _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} - _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] + # if _table : + # if sqa.__version__.startswith('1.') : + # _handler = sqa.MetaData(bind=self._engine) + # _handler.reflect() + # else: + # # + # # sqlalchemy's version 2.+ + # _handler = sqa.MetaData() + # _handler.reflect(bind=self._engine) + # # + # # Let us extract the schema with the native types + # _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} + # _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] + # + + _inspector = inspect(self._engine) + _columns = _inspector.get_columns(_table) + _schema = 
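Patch 244 above makes the ETL wrapper forward the source's schema to every target writer. The sketch below is roughly the equivalent flow done by hand, which is what IETL now does internally; the providers, paths and table names are placeholders. From the command line the same flow is driven by `transport etl generate <path>` followed by `transport etl run <path>`, per the refactored sub-commands above.

    import transport
    from transport import providers

    reader  = transport.get.reader(provider=providers.SQLITE3, database='/tmp/demo.db3', table='friends')
    writer  = transport.get.writer(provider=providers.POSTGRESQL, database='demo', table='friends')

    _schema = reader.meta()    # e.g. [{'name': 'name', 'type': 'STRING'}, {'name': 'age', 'type': 'INTEGER'}]
    writer.write(reader.read(), schema=_schema)
    writer.close()
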
[{'name':column['name'],'type':_map.get(str(column['type']),str(column['type'])) } for column in _columns] return _schema def has(self,**_args): return self.meta(**_args) @@ -94,7 +100,11 @@ class SQLBase(Base): # _uri = [_item.strip() for _item in _uri if _item.strip()] # return '/'.join(_uri) return f'{_provider}://{_host}/{_database}' if _account == '' else f'{_provider}://{_account}{_host}/{_database}' - + def close(self,) : + try: + self._engine.dispose() + except : + pass class BaseReader(SQLBase): def __init__(self,**_args): super().__init__(**_args) From 1a8112f1521e4aec9d318940834f6358a443f7f8 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Sun, 2 Feb 2025 22:37:07 -0600 Subject: [PATCH 245/271] adding iceberg notebook --- notebooks/iceberg.ipynb | 138 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 notebooks/iceberg.ipynb diff --git a/notebooks/iceberg.ipynb b/notebooks/iceberg.ipynb new file mode 100644 index 0000000..849e088 --- /dev/null +++ b/notebooks/iceberg.ipynb @@ -0,0 +1,138 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Writing to Apache Iceberg\n", + "\n", + "1. Insure you have a Google Bigquery service account key on disk\n", + "2. The service key location is set as an environment variable **BQ_KEY**\n", + "3. The dataset will be automatically created within the project associated with the service key\n", + "\n", + "The cell below creates a dataframe that will be stored within Google Bigquery" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data transport version ', '2.4.0']\n" + ] + } + ], + "source": [ + "#\n", + "# Writing to Google Bigquery database\n", + "#\n", + "import transport\n", + "from transport import providers\n", + "import pandas as pd\n", + "import os\n", + "\n", + "PRIVATE_KEY = os.environ['BQ_KEY'] #-- location of the service key\n", + "DATASET = 'demo'\n", + "_data = pd.DataFrame({\"name\":['James Bond','Steve Rogers','Steve Nyemba'],'age':[55,150,44]})\n", + "# bqw = transport.get.writer(provider=providers.ICEBERG,catalog='mz',database='edw.mz',table='friends')\n", + "bqw = transport.get.writer(provider=providers.ICEBERG,table='edw.mz.friends')\n", + "bqw.write(_data,if_exists='replace') #-- default is append\n", + "print (['data transport version ', transport.__version__])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reading from Google Bigquery\n", + "\n", + "The cell below reads the data that has been written by the cell above and computes the average age within a Google Bigquery (simple query). 
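Patch 244 also gives the SQL-based connectors an explicit close() that disposes of the underlying SQLAlchemy engine. A small sketch of the intended call pattern; the provider, database and query are placeholders, and apply() is the pre-existing escape hatch for raw SQL:

    import transport
    from transport import providers

    pgr = transport.get.reader(provider=providers.POSTGRESQL, database='demo', table='friends')
    _df = pgr.apply('SELECT COUNT(*) AS _rows FROM friends')   # SELECT/WITH statements come back as DataFrames
    pgr.close()                                                # disposes of the engine; safe to call more than once
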
\n", + "\n", + "- Basic read of the designated table (friends) created above\n", + "- Execute an aggregate SQL against the table\n", + "\n", + "**NOTE**\n", + "\n", + "By design **read** object are separated from **write** objects in order to avoid accidental writes to the database.\n", + "Read objects are created with **transport.get.reader** whereas write objects are created with **transport.get.writer**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " name age\n", + "0 James Bond 55\n", + "1 Steve Rogers 150\n", + "2 Steve Nyemba 44\n", + "--------- STATISTICS ------------\n" + ] + } + ], + "source": [ + "\n", + "import transport\n", + "from transport import providers\n", + "import os\n", + "PRIVATE_KEY=os.environ['BQ_KEY']\n", + "pgr = transport.get.reader(provider=providers.ICEBERG,database='edw.mz')\n", + "_df = pgr.read(table='friends')\n", + "_query = 'SELECT COUNT(*) _counts, AVG(age) from friends'\n", + "_sdf = pgr.read(sql=_query)\n", + "print (_df)\n", + "print ('--------- STATISTICS ------------')\n", + "# print (_sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An **auth-file** is a file that contains database parameters used to access the database. \n", + "For code in shared environments, we recommend \n", + "\n", + "1. Having the **auth-file** stored on disk \n", + "2. and the location of the file is set to an environment variable.\n", + "\n", + "To generate a template of the **auth-file** open the **file generator wizard** found at visit https://healthcareio.the-phi.com/data-transport" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From cdf783143e4b9cdfe2dbe9829370feeec6421be0 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Feb 2025 12:52:44 -0600 Subject: [PATCH 246/271] ... 
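Tying the notebook's auth-file recommendation to code, a minimal sketch of keeping credentials in a JSON file outside the notebook and splatting them into the factory; the path and the keys inside the file are placeholders that depend on the provider being targeted:

    import json
    import transport

    with open('/home/me/.secrets/demo-auth.json') as f:
        _auth = json.loads(f.read())     # e.g. {"provider": "postgresql", "database": "demo", ...}

    reader = transport.get.reader(**_auth, table='friends')
    _df = reader.read()
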
--- transport/plugins/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transport/plugins/__init__.py b/transport/plugins/__init__.py index 760b66c..93ba11c 100644 --- a/transport/plugins/__init__.py +++ b/transport/plugins/__init__.py @@ -59,6 +59,9 @@ class PluginLoader : pass def load (self,**_args): + """ + This function loads a plugin + """ self._modules = {} self._names = [] path = _args ['path'] From 30645e46bd538d4cfd916c662137ed521954d481 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Feb 2025 23:03:14 -0600 Subject: [PATCH 247/271] bug fix: readonly for duckdb --- transport/sql/common.py | 12 +++++++++--- transport/sql/duckdb.py | 4 +++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/transport/sql/common.py b/transport/sql/common.py index 1a7e8a3..f647acb 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -13,7 +13,13 @@ class Base: self._port = None self._database = _args['database'] self._table = _args['table'] if 'table' in _args else None - self._engine= sqa.create_engine(self._get_uri(**_args),future=True) + _uri = self._get_uri(**_args) + if type(_uri) == str : + self._engine= sqa.create_engine(_uri,future=True) + else: + + _uri,_kwargs = _uri + self._engine= sqa.create_engine(_uri,**_kwargs,future=True) def _set_uri(self,**_args) : """ :provider provider @@ -64,8 +70,8 @@ class Base: @TODO: Execution of stored procedures """ - if sql.lower().startswith('select') or sql.lower().startswith('with') : - + if sql.strip().lower().startswith('select') or sql.strip().lower().startswith('with') or sql.strip().startswith('show'): + print (self._engine) return pd.read_sql(sql,self._engine) else: _handler = self._engine.connect() diff --git a/transport/sql/duckdb.py b/transport/sql/duckdb.py index 06f66e5..97fb3fa 100644 --- a/transport/sql/duckdb.py +++ b/transport/sql/duckdb.py @@ -15,9 +15,11 @@ class Duck : def _get_uri(self,**_args): return f"""duckdb:///{self.database}""" class Reader(Duck,BaseReader) : - def __init__(self,**_args): + def __init__(self,**_args): Duck.__init__(self,**_args) BaseReader.__init__(self,**_args) + def _get_uri(self,**_args): + return super()._get_uri(**_args),{'connect_args':{'read_only':True}} class Writer(Duck,BaseWriter): def __init__(self,**_args): Duck.__init__(self,**_args) From afa442ea8ddb3fc9cf270028f8b3b9b63089a81b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Feb 2025 23:07:47 -0600 Subject: [PATCH 248/271] versioning update edition --- info/__init__.py | 3 ++- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 3eded86..bbdf8fd 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,7 +1,8 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.6' +__version__= '2.2.8' __email__ = "info@the-phi.com" +__edition__= 'ce' __license__=f""" Copyright 2010 - 2024, Steve L. 
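The read_only flag introduced above is what lets several reader processes attach to the same DuckDB file concurrently. A usage sketch, with the file path and table name as placeholders; note that writers keep opening the database in read-write mode:

    import transport
    from transport import providers

    duck = transport.get.reader(provider=providers.DUCKDB, database='/tmp/demo.duckdb', table='friends')
    _df = duck.read()          # the engine is created with connect_args={'read_only': True}
    duck.close()
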
Nyemba diff --git a/setup.py b/setup.py index f11a6ca..e8d2de0 100644 --- a/setup.py +++ b/setup.py @@ -5,14 +5,14 @@ from setuptools import setup, find_packages import os import sys # from version import __version__,__author__ -from info import __version__, __author__,__app_name__,__license__ +from info import __version__, __author__,__app_name__,__license__,__edition___ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":__app_name__, - "version":__version__, + "version":'-'.join([__version__,__edition__]), "author":__author__,"author_email":"info@the-phi.com", "license":__license__, # "packages":["transport","info","transport/sql"]}, From a1b5f2743ca8a046d5c721841df90427f96a1347 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 19 Feb 2025 23:26:12 -0600 Subject: [PATCH 249/271] bug fixes ... --- info/__init__.py | 9 +++++---- setup.py | 8 ++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index bbdf8fd..32cdcc4 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -14,9 +14,10 @@ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR I """ -__whatsnew__=f"""version {__version__}, focuses on collaborative environments like jupyter-base servers (apache zeppelin; jupyter notebook, jupyterlab, jupyterhub) +__whatsnew__=f"""version {__version__}, +1. Added support for read/write logs as well as plugins (when applied) +2. Bug fix with duckdb (adding readonly) for readers because there are issues with threads & processes +3. support for streaming data, important to use this with large volumes of data + - 1. simpler syntax to create readers/writers - 2. auth-file registry that can be referenced using a label - 3. duckdb support """ diff --git a/setup.py b/setup.py index e8d2de0..a0cfeed 100644 --- a/setup.py +++ b/setup.py @@ -5,14 +5,14 @@ from setuptools import setup, find_packages import os import sys # from version import __version__,__author__ -from info import __version__, __author__,__app_name__,__license__,__edition___ +from info import __version__, __author__,__app_name__,__license__,__edition__ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() args = { "name":__app_name__, - "version":'-'.join([__version__,__edition__]), + "version":__version__, "author":__author__,"author_email":"info@the-phi.com", "license":__license__, # "packages":["transport","info","transport/sql"]}, @@ -22,6 +22,10 @@ args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write', args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] +args['classifiers'] = ['Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], # if sys.version_info[0] == 2 : # args['use_2to3'] = True # args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] From eaa2b99a2d48c990d44d8cdf07ec8cb1a5b77184 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 24 Feb 2025 09:26:15 -0600 Subject: [PATCH 250/271] bug fix: schema (postgresql) construct --- 
transport/iowrapper.py | 12 ++++++------ transport/sql/common.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/transport/iowrapper.py b/transport/iowrapper.py index cf5d717..700b589 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -45,12 +45,12 @@ class IO: def close(self): if hasattr(self._agent,'close') : self._agent.close() - def apply(self): - """ - applying pre/post conditions given a pipeline expression - """ - for _pointer in self._plugins : - _data = _pointer(_data) + # def apply(self): + # """ + # applying pre/post conditions given a pipeline expression + # """ + # for _pointer in self._plugins : + # _data = _pointer(_data) def apply(self,_query): if hasattr(self._agent,'apply') : return self._agent.apply(_query) diff --git a/transport/sql/common.py b/transport/sql/common.py index f647acb..304e945 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -71,7 +71,7 @@ class Base: @TODO: Execution of stored procedures """ if sql.strip().lower().startswith('select') or sql.strip().lower().startswith('with') or sql.strip().startswith('show'): - print (self._engine) + return pd.read_sql(sql,self._engine) else: _handler = self._engine.connect() @@ -83,6 +83,7 @@ class Base: class SQLBase(Base): def __init__(self,**_args): super().__init__(**_args) + self._schema = _args.get('schema',None) def get_provider(self): raise Exception ("Provider Needs to be set ...") def get_default_port(self) : @@ -122,6 +123,8 @@ class BaseReader(SQLBase): sql = _args['sql'] else: _table = _args['table'] if 'table' in _args else self._table + if self._schema and type(self._schema) == str : + _table = f'{self._schema}.{_table}' sql = f'SELECT * FROM {_table}' return self.apply(sql) @@ -132,6 +135,7 @@ class BaseWriter (SQLBase): """ def __init__(self,**_args): super().__init__(**_args) + def write(self,_data,**_args): if type(_data) == dict : _df = pd.DataFrame(_data) @@ -151,5 +155,8 @@ class BaseWriter (SQLBase): # _mode['schema'] = _args['schema'] # if 'if_exists' in _args : # _mode['if_exists'] = _args['if_exists'] - + if 'schema' in _args and type(_args['schema']) == str: + self._schema = _args.get('schema',None) + if self._schema : + _mode['schema'] = self._schema _df.to_sql(_table,self._engine,**_mode) \ No newline at end of file From dad2956a8c55d90c842616348abc6b5dbc6f2102 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 24 Feb 2025 09:29:42 -0600 Subject: [PATCH 251/271] version update --- info/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 32cdcc4..9355349 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,8 +1,8 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.8' +__version__= '2.2.10' __email__ = "info@the-phi.com" -__edition__= 'ce' +__edition__= 'community' __license__=f""" Copyright 2010 - 2024, Steve L. 
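With patch 250, a schema passed at construction (or to write()) is honoured on both sides: readers prefix the table name and writers hand the schema to pandas.to_sql. A sketch, where the database, schema and table names are placeholders:

    import pandas as pd
    import transport
    from transport import providers

    _df = pd.DataFrame({'name': ['James Bond'], 'age': [55]})

    pgw = transport.get.writer(provider=providers.POSTGRESQL, database='demo', table='friends', schema='edw')
    pgw.write(_df)          # rows land in edw.friends rather than the default schema
    pgw.close()

    pgr = transport.get.reader(provider=providers.POSTGRESQL, database='demo', table='friends', schema='edw')
    print(pgr.read())       # SELECT * FROM edw.friends
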
Nyemba From dd10f6db78db480f83e57332b0f5b5c4a4d0a67d Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 24 Feb 2025 09:35:36 -0600 Subject: [PATCH 252/271] bug fix: version & cli --- bin/transport | 14 +++++++++++--- transport/__init__.py | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/transport b/bin/transport index eb8b17a..6ca01bc 100755 --- a/bin/transport +++ b/bin/transport @@ -103,14 +103,22 @@ def supported (format:Annotated[str,typer.Argument(help="format of the output, s else: print (_df) print () - +@app_i.command(name="version") +def version (): + """ + This function will return the version of the data-transport + """ + print() + print (f'[bold] {transport.__app_name__} ,[blue] {transport.__edition__} edition [/blue], version {transport.__version__}[/bold]') + print () + @app_i.command(name="license") def info(): """ This function will display version and license information """ - - print (f'[bold] {transport.__app_name__} ,version {transport.__version__}[/bold]') + print() + print (f'[bold] {transport.__app_name__} ,{transport.__edition__}, version {transport.__version__}[/bold]') print () print (transport.__license__) diff --git a/transport/__init__.py b/transport/__init__.py index 33a3261..c3bb901 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -22,7 +22,7 @@ from transport import sql, nosql, cloud, other, warehouse import pandas as pd import json import os -from info import __version__,__author__,__email__,__license__,__app_name__,__whatsnew__ +from info import __version__,__author__,__email__,__license__,__app_name__,__whatsnew__,__edition__ from transport.iowrapper import IWriter, IReader, IETL from transport.plugins import PluginLoader from transport import providers From 469c6f89a2a3f7ccc2391831578b266b9f8e7cb4 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Mar 2025 21:24:15 -0600 Subject: [PATCH 253/271] fixes with plugin handler --- bin/transport | 70 +++++++---- transport/iowrapper.py | 52 +++++--- transport/registry.py | 264 ++++++++++++++++++++--------------------- 3 files changed, 212 insertions(+), 174 deletions(-) diff --git a/bin/transport b/bin/transport index 6ca01bc..19b664e 100755 --- a/bin/transport +++ b/bin/transport @@ -34,6 +34,8 @@ import time from termcolor import colored from enum import Enum from rich import print +import plugin_ix as pix + app = typer.Typer() app_e = typer.Typer() #-- handles etl (run, generate) @@ -147,7 +149,7 @@ def initregistry (email:Annotated[str,typer.Argument(help="email")], path:str=typer.Option(default=REGISTRY_PATH,help="path or location of the configuration file"), override:bool=typer.Option(default=False,help="override existing configuration or not")): """ - This functiion will initialize the registry and have both application and calling code loading the database parameters by a label + This functiion will initialize the data-transport registry and have both application and calling code loading the database parameters by a label """ try: @@ -179,42 +181,62 @@ def register (label:Annotated[str,typer.Argument(help="unique label that will be pass @app_x.command(name='add') def register_plugs ( - alias:Annotated[str,typer.Argument(help="unique alias fo the file being registered")], - path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")] + alias:Annotated[str,typer.Argument(help="unique function name within a file")], + path:Annotated[str,typer.Argument(help="path of the python file, that contains functions")], + 
folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder"), + ): """ - This function will register a file and the functions within will be refrences . in a configuration file + This function will register a file and the functions within we are interested in using """ - transport.registry.plugins.init() - _log = transport.registry.plugins.add(alias,path) + if ',' in alias : + alias = [_name.strip() for _name in alias.split(',') if _name.strip() != '' ] + else: + alias = [alias.strip()] + _pregistry = pix.Registry(folder=folder,plugin_folder='plugins/code') + _log = _pregistry.set(path,alias) + # transport.registry.plugins.init() + # _log = transport.registry.plugins.add(alias,path) _mark = TIMES_MARK if not _log else CHECK_MARK - _msg = f"""Could NOT add the [bold]{alias}[/bold]to the registry""" if not _log else f""" successfully added {alias}, {len(_log)} functions added""" + _msg = f"""Could NOT add the [bold]{alias}[/bold]to the registry""" if not _log else f""" successfully added {alias}, {_log} functions registered""" print (f"""{_mark} {_msg}""") @app_x.command(name="list") -def registry_list (): - - transport.registry.plugins.init() - _d = [] - for _alias in transport.registry.plugins._data : - _data = transport.registry.plugins._data[_alias] - _d += [{'alias':_alias,"plugin-count":len(_data['content']),'e.g':'@'.join([_alias,_data['content'][0]]),'plugins':json.dumps(_data['content'])}] - if _d: - print (pd.DataFrame(_d)) +def registry_list (folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport configuration folder")): + """ + This function will list all the plugins (python functions/files) that are registered and can be reused + """ + _pregistry = pix.Registry(folder=folder) + _df = _pregistry.stats() + if _df.empty : + print (f"{TIMES_MARK} registry at {folder} is not ready") else: - print (f"""{TIMES_MARK}, Plugin registry is not available or needs initialization""") + print (_df) +@app_x.command ("has") +def registry_has (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")], + folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry file")) : + _pregistry = pix.Registry(folder=folder) + if _pregistry.has(alias) : + _msg = f"{CHECK_MARK} {alias} was [bold] found [/bold] in registry " + else: + _msg = f"{TIMES_MARK} {alias} was [bold] NOT found [/bold] in registry " + print (_msg) + @app_x.command(name="test") -def registry_test (key): +def registry_test (alias:Annotated[str,typer.Argument(help="alias of a function function@file or file.function")], + folder:str=typer.Option(default=REGISTRY_PATH,help="path of the data-transport registry folder")) : + _pregistry = pix.Registry(folder=folder) """ This function allows to test syntax for a plugin i.e in terms of alias@function """ - _item = transport.registry.plugins.has(key=key) - if _item : - del _item['pointer'] - print (f"""{CHECK_MARK} successfully loaded \033[1m{key}\033[0m found, version {_item['version']}""") - print (pd.DataFrame([_item])) + # _item = transport.registry.plugins.has(key=key) + _pointer = _pregistry.get(alias) if _pregistry.has(alias) else None + + if _pointer: + print (f"""{CHECK_MARK} successfully loaded [bold] {alias}[/bold] found in {folder}""") + else: - print (f"{TIMES_MARK} unable to load \033[1m{key}\033[0m. Make sure it is registered") + print (f"{TIMES_MARK} unable to load {alias}. 
Make sure it is registered") app.add_typer(app_e,name='etl',help="This function will run etl or generate a template etl configuration file") app.add_typer(app_r,name='registry',help='This function allows labeling database access information') app.add_typer(app_i,name="info",help="This function will print either license or supported database technologies") diff --git a/transport/iowrapper.py b/transport/iowrapper.py index 700b589..396135e 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -11,6 +11,7 @@ from transport import providers from multiprocessing import Process import time +import plugin_ix class IO: """ @@ -21,20 +22,25 @@ class IO: plugins = _args['plugins'] if 'plugins' not in _args else None self._agent = _agent + self._ixloader = plugin_ix.Loader () #-- if plugins : - self._init_plugins(plugins) - else: - self._plugins = None + self.init_plugins(plugins) + # for _ref in plugins : + # self._ixloader.set(_ref) + # if plugins : + # self._init_plugins(plugins) + # else: + # self._plugins = None - def _init_plugins(self,_args): - """ - This function will load pipelined functions as a plugin loader - """ - if 'path' in _args and 'names' in _args : - self._plugins = PluginLoader(**_args) - else: - self._plugins = PluginLoader() - [self._plugins.set(_pointer) for _pointer in _args] + # def _init_plugins(self,_args): + # """ + # This function will load pipelined functions as a plugin loader + # """ + # if 'path' in _args and 'names' in _args : + # self._plugins = PluginLoader(**_args) + # else: + # self._plugins = PluginLoader() + # [self._plugins.set(_pointer) for _pointer in _args] # # @TODO: We should have a way to log what plugins are loaded and ready to use def meta (self,**_args): @@ -62,6 +68,10 @@ class IO: pointer = getattr(self._agent,_name) return pointer(_query) return None + def init_plugins(self,plugins): + for _ref in plugins : + self._ixloader.set(_ref) + class IReader(IO): """ This is a wrapper for read functionalities @@ -71,22 +81,28 @@ class IReader(IO): def read(self,**_args): if 'plugins' in _args : - self._init_plugins(_args['plugins']) + self.init_plugins(_args['plugins']) + _data = self._agent.read(**_args) - if self._plugins and self._plugins.ratio() > 0 : - _data = self._plugins.apply(_data) + # if self._plugins and self._plugins.ratio() > 0 : + # _data = self._plugins.apply(_data) # # output data + + # + # applying the the design pattern + _data = self._ixloader.visitor(_data) return _data class IWriter(IO): def __init__(self,**_args): #_agent,pipeline=None): super().__init__(**_args) #_agent,pipeline) def write(self,_data,**_args): + # if 'plugins' in _args : + # self._init_plugins(_args['plugins']) if 'plugins' in _args : - self._init_plugins(_args['plugins']) - if self._plugins and self._plugins.ratio() > 0 : - _data = self._plugins.apply(_data) + self.init_plugins(_args['plugins']) + self._ixloader.visitor(_data) self._agent.write(_data,**_args) # diff --git a/transport/registry.py b/transport/registry.py index 1f612dc..71909f6 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -21,161 +21,161 @@ if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ : REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH'] REGISTRY_FILE= 'transport-registry.json' DATA = {} -class plugins: - # - # This is a utility function that should enable management of plugins-registry - # The class allows to add/remove elements - # - # @TODO: add read/write properties to the class (better design practice) - # - _data = {} - FOLDER = 
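Once a plugin file has been registered (transport plugins add <alias> <path>), readers and writers can pull functions from the registry by reference instead of importing them. A sketch; the alias, function name and database details are invented, the alias@file reference format is taken from the CLI help text above, and it assumes the factory passes the plugins argument through to the IO wrapper:

    import transport
    from transport import providers

    # previously registered with:  transport plugins add myplugins /path/to/myplugins.py
    reader = transport.get.reader(provider=providers.SQLITE3, database='/tmp/demo.db3',
                                  table='friends', plugins=['scrub@myplugins'])
    _df = reader.read()    # each registered plugin is applied to the frame before it is returned
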
os.sep.join([REGISTRY_PATH,'plugins']) - CODE = os.sep.join([REGISTRY_PATH,'plugins','code']) - FILE = os.sep.join([REGISTRY_PATH,'plugin-registry.json']) - @staticmethod - def init(): +# class plugins: +# # +# # This is a utility function that should enable management of plugins-registry +# # The class allows to add/remove elements +# # +# # @TODO: add read/write properties to the class (better design practice) +# # +# _data = {} +# FOLDER = os.sep.join([REGISTRY_PATH,'plugins']) +# CODE = os.sep.join([REGISTRY_PATH,'plugins','code']) +# FILE = os.sep.join([REGISTRY_PATH,'plugin-registry.json']) +# @staticmethod +# def init(): - if not os.path.exists(plugins.FOLDER) : - os.makedirs(plugins.FOLDER) - if not os.path.exists(plugins.CODE): - os.makedirs(plugins.CODE) - if not os.path.exists(plugins.FILE): - f = open(plugins.FILE,'w') - f.write("{}") - f.close() - plugins._read() #-- will load data as a side effect +# if not os.path.exists(plugins.FOLDER) : +# os.makedirs(plugins.FOLDER) +# if not os.path.exists(plugins.CODE): +# os.makedirs(plugins.CODE) +# if not os.path.exists(plugins.FILE): +# f = open(plugins.FILE,'w') +# f.write("{}") +# f.close() +# plugins._read() #-- will load data as a side effect - @staticmethod - def copy (path) : +# @staticmethod +# def copy (path) : - shutil.copy2(path,plugins.CODE) - @staticmethod - def _read (): - f = open(plugins.FILE) - try: - _data = json.loads(f.read()) - f.close() - except Exception as e: - print (f"Corrupted registry, resetting ...") - _data = {} - plugins._write(_data) +# shutil.copy2(path,plugins.CODE) +# @staticmethod +# def _read (): +# f = open(plugins.FILE) +# try: +# _data = json.loads(f.read()) +# f.close() +# except Exception as e: +# print (f"Corrupted registry, resetting ...") +# _data = {} +# plugins._write(_data) - plugins._data = _data - @staticmethod - def _write (_data): - f = open(plugins.FILE,'w') - f.write(json.dumps(_data)) - f.close() - plugins._data = _data +# plugins._data = _data +# @staticmethod +# def _write (_data): +# f = open(plugins.FILE,'w') +# f.write(json.dumps(_data)) +# f.close() +# plugins._data = _data - @staticmethod - def inspect (_path): - _names = [] +# @staticmethod +# def inspect (_path): +# _names = [] - if os.path.exists(_path) : - _filename = _path.split(os.sep)[-1] - spec = importlib.util.spec_from_file_location(_filename, _path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) +# if os.path.exists(_path) : +# _filename = _path.split(os.sep)[-1] +# spec = importlib.util.spec_from_file_location(_filename, _path) +# module = importlib.util.module_from_spec(spec) +# spec.loader.exec_module(module) - # _names = [{'name':getattr(getattr(module,_name),'name'),'pointer':getattr(module,_name)} for _name in dir(module) if type( getattr(module,_name)).__name__ == 'function'] - for _name in dir(module) : - _pointer = getattr(module,_name) - if hasattr(_pointer,'transport') : - _item = {'real_name':_name,'name':getattr(_pointer,'name'),'pointer':_pointer,'version':getattr(_pointer,'version')} - _names.append(_item) +# # _names = [{'name':getattr(getattr(module,_name),'name'),'pointer':getattr(module,_name)} for _name in dir(module) if type( getattr(module,_name)).__name__ == 'function'] +# for _name in dir(module) : +# _pointer = getattr(module,_name) +# if hasattr(_pointer,'transport') : +# _item = {'real_name':_name,'name':getattr(_pointer,'name'),'pointer':_pointer,'version':getattr(_pointer,'version')} +# _names.append(_item) - return _names - @staticmethod - def add 
(alias,path): - """ - Add overwrite the registry entries - """ - _names = plugins.inspect (path) - _log = [] +# return _names +# @staticmethod +# def add (alias,path): +# """ +# Add overwrite the registry entries +# """ +# _names = plugins.inspect (path) +# _log = [] - if _names : - # - # We should make sure we have all the plugins with the attributes (transport,name) set - _names = [_item for _item in _names if hasattr(_item['pointer'],'transport') ] - if _names : - plugins.copy(path) - _content = [] +# if _names : +# # +# # We should make sure we have all the plugins with the attributes (transport,name) set +# _names = [_item for _item in _names if hasattr(_item['pointer'],'transport') ] +# if _names : +# plugins.copy(path) +# _content = [] - for _item in _names : - _key = '@'.join([alias,_item['name']]) - _log.append(_item['name']) - # - # Let us update the registry - # - plugins.update(alias,path,_log) - return _log +# for _item in _names : +# _key = '@'.join([alias,_item['name']]) +# _log.append(_item['name']) +# # +# # Let us update the registry +# # +# plugins.update(alias,path,_log) +# return _log - @staticmethod - def update (alias,path,_log) : - """ - updating the registry entries of the plugins (management data) - """ - # f = open(plugins.FILE) - # _data = json.loads(f.read()) - # f.close() - _data = plugins._data - # _log = plugins.add(alias,path) +# @staticmethod +# def update (alias,path,_log) : +# """ +# updating the registry entries of the plugins (management data) +# """ +# # f = open(plugins.FILE) +# # _data = json.loads(f.read()) +# # f.close() +# _data = plugins._data +# # _log = plugins.add(alias,path) - if _log : - _data[alias] = {'content':_log,'name':path.split(os.sep)[-1]} - plugins._write(_data) #-- will update data as a side effect +# if _log : +# _data[alias] = {'content':_log,'name':path.split(os.sep)[-1]} +# plugins._write(_data) #-- will update data as a side effect - return _log - @staticmethod - def get(**_args) : - # f = open(plugins.FILE) - # _data = json.loads(f.read()) - # f.close() - # if 'key' in _args : - # alias,name = _args['key'].split('.') if '.' in _args['key'] else _args['key'].split('@') - # else : - # alias = _args['alias'] - # name = _args['name'] +# return _log +# @staticmethod +# def get(**_args) : +# # f = open(plugins.FILE) +# # _data = json.loads(f.read()) +# # f.close() +# # if 'key' in _args : +# # alias,name = _args['key'].split('.') if '.' in _args['key'] else _args['key'].split('@') +# # else : +# # alias = _args['alias'] +# # name = _args['name'] - # if alias in _data : +# # if alias in _data : - # _path = os.sep.join([plugins.CODE,_data[alias]['name']]) - # _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] +# # _path = os.sep.join([plugins.CODE,_data[alias]['name']]) +# # _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] - # _item = _item[0] if _item else None - # if _item : +# # _item = _item[0] if _item else None +# # if _item : - # return _item['pointer'] - # return None - _item = plugins.has(**_args) - return _item['pointer'] if _item else None +# # return _item['pointer'] +# # return None +# _item = plugins.has(**_args) +# return _item['pointer'] if _item else None - @staticmethod - def has (**_args): - f = open(plugins.FILE) - _data = json.loads(f.read()) - f.close() - if 'key' in _args : - alias,name = _args['key'].split('.') if '.' 
in _args['key'] else _args['key'].split('@') - else : - alias = _args['alias'] - name = _args['name'] +# @staticmethod +# def has (**_args): +# f = open(plugins.FILE) +# _data = json.loads(f.read()) +# f.close() +# if 'key' in _args : +# alias,name = _args['key'].split('.') if '.' in _args['key'] else _args['key'].split('@') +# else : +# alias = _args['alias'] +# name = _args['name'] - if alias in _data : +# if alias in _data : - _path = os.sep.join([plugins.CODE,_data[alias]['name']]) - _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] +# _path = os.sep.join([plugins.CODE,_data[alias]['name']]) +# _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] - _item = _item[0] if _item else None - if _item : +# _item = _item[0] if _item else None +# if _item : - return copy.copy(_item) - return None - @staticmethod - def synch(): - pass +# return copy.copy(_item) +# return None +# @staticmethod +# def synch(): +# pass def isloaded (): return DATA not in [{},None] From 98ef8a848e93abc13634ab7868747d44f45f22a6 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Mar 2025 22:11:37 -0600 Subject: [PATCH 254/271] bug fixes and dependencies --- setup.py | 2 +- transport/iowrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a0cfeed..6888fe5 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill'] +args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill',"git+https://github.com/lnyemba/plugins-ix"] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] args['classifiers'] = ['Programming Language :: Python :: 3', diff --git a/transport/iowrapper.py b/transport/iowrapper.py index 396135e..ff49906 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -19,7 +19,7 @@ class IO: """ def __init__(self,**_args): _agent = _args['agent'] - plugins = _args['plugins'] if 'plugins' not in _args else None + plugins = _args['plugins'] if 'plugins' in _args else None self._agent = _agent self._ixloader = plugin_ix.Loader () #-- From 0977ad1b181cf8f1d2e583b36b09d2ccdf569816 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Wed, 5 Mar 2025 22:19:06 -0600 Subject: [PATCH 255/271] setup fixes --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6888fe5..503a2d6 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ args = { "packages": find_packages(include=['info','transport', 'transport.*'])} args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = 
['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill',"git+https://github.com/lnyemba/plugins-ix"] +args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill','plugin-ix@git+https://github.com/lnyemba/plugins-ix'] args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] args['classifiers'] = ['Programming Language :: Python :: 3', From 4b34c746ae80fbbb59ca428c2512b33570457945 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Thu, 10 Apr 2025 20:51:45 -0500 Subject: [PATCH 256/271] bug fix: missing table --- info/__init__.py | 2 +- transport/sql/common.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 9355349..98966f9 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.10' +__version__= '2.2.12' __email__ = "info@the-phi.com" __edition__= 'community' __license__=f""" diff --git a/transport/sql/common.py b/transport/sql/common.py index 304e945..7cf303f 100644 --- a/transport/sql/common.py +++ b/transport/sql/common.py @@ -56,11 +56,17 @@ class Base: # _map = {'BIGINT':'INTEGER','TEXT':'STRING','DOUBLE_PRECISION':'FLOAT','NUMERIC':'FLOAT','DECIMAL':'FLOAT','REAL':'FLOAT'} # _schema = [{"name":_attr.name,"type":_map.get(str(_attr.type),str(_attr.type))} for _attr in _handler.tables[_table].columns] # + try: + if _table : + _inspector = inspect(self._engine) + _columns = _inspector.get_columns(_table) + _schema = [{'name':column['name'],'type':_map.get(str(column['type']),str(column['type'])) } for column in _columns] + return _schema + except Exception as e: + pass - _inspector = inspect(self._engine) - _columns = _inspector.get_columns(_table) - _schema = [{'name':column['name'],'type':_map.get(str(column['type']),str(column['type'])) } for column in _columns] - return _schema + # else: + return [] def has(self,**_args): return self.meta(**_args) def apply(self,sql): @@ -137,8 +143,9 @@ class BaseWriter (SQLBase): super().__init__(**_args) def write(self,_data,**_args): + if type(_data) == dict : - _df = pd.DataFrame(_data) + _df = pd.DataFrame([_data]) elif type(_data) == list : _df = pd.DataFrame(_data) else: From b0cd0b85dce58d7747a0a03962431bdba8a2b71c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 16 May 2025 11:02:18 -0500 Subject: [PATCH 257/271] bug fix: logger issue --- info/__init__.py | 2 +- transport/iowrapper.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 98966f9..3e6c1ca 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.12' +__version__= '2.2.14' __email__ = "info@the-phi.com" __edition__= 'community' __license__=f""" diff --git a/transport/iowrapper.py 
b/transport/iowrapper.py index ff49906..3fc94c7 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -22,7 +22,8 @@ class IO: plugins = _args['plugins'] if 'plugins' in _args else None self._agent = _agent - self._ixloader = plugin_ix.Loader () #-- + # self._ixloader = plugin_ix.Loader () #-- must indicate where the plugin registry file is + self._ixloader = plugin_ix.Loader (registry=plugin_ix.Registry(folder=transport.registry.REGISTRY_PATH)) if plugins : self.init_plugins(plugins) # for _ref in plugins : From 6f8019f5829189deea34590e9c09667631abe5d3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 16 Jun 2025 11:30:49 -0500 Subject: [PATCH 258/271] bug fix --- info/__init__.py | 2 +- transport/warehouse/iceberg.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 3e6c1ca..19b0e10 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.14' +__version__= '2.2.16' __email__ = "info@the-phi.com" __edition__= 'community' __license__=f""" diff --git a/transport/warehouse/iceberg.py b/transport/warehouse/iceberg.py index 4e73c62..3def181 100644 --- a/transport/warehouse/iceberg.py +++ b/transport/warehouse/iceberg.py @@ -74,7 +74,7 @@ class Iceberg : """ sql query/command to run against apache iceberg """ - return self._session.sql(_query) + return self._session.sql(_query).toPandas() def has (self,**_args): try: _prefix = self._getPrefix(**_args) From e035f5eba0812006bab16e4bb117fef387775927 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 23 Jun 2025 04:08:14 -0500 Subject: [PATCH 259/271] windows bug fix, environment variable --- info/__init__.py | 2 +- transport/iowrapper.py | 19 +------------------ transport/registry.py | 2 +- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/info/__init__.py b/info/__init__.py index 19b0e10..ef03a5b 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.16' +__version__= '2.2.18' __email__ = "info@the-phi.com" __edition__= 'community' __license__=f""" diff --git a/transport/iowrapper.py b/transport/iowrapper.py index 3fc94c7..6fd1196 100644 --- a/transport/iowrapper.py +++ b/transport/iowrapper.py @@ -26,24 +26,7 @@ class IO: self._ixloader = plugin_ix.Loader (registry=plugin_ix.Registry(folder=transport.registry.REGISTRY_PATH)) if plugins : self.init_plugins(plugins) - # for _ref in plugins : - # self._ixloader.set(_ref) - # if plugins : - # self._init_plugins(plugins) - # else: - # self._plugins = None - - # def _init_plugins(self,_args): - # """ - # This function will load pipelined functions as a plugin loader - # """ - # if 'path' in _args and 'names' in _args : - # self._plugins = PluginLoader(**_args) - # else: - # self._plugins = PluginLoader() - # [self._plugins.set(_pointer) for _pointer in _args] - # - # @TODO: We should have a way to log what plugins are loaded and ready to use + def meta (self,**_args): if hasattr(self._agent,'meta') : return self._agent.meta(**_args) diff --git a/transport/registry.py b/transport/registry.py index 71909f6..8853069 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -13,7 +13,7 @@ This class manages data from the registry and allows (read only) @TODO: add property to the DATA attribute """ -REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) 
+REGISTRY_PATH=os.sep.join([os.environ.get('HOME','USERPROFILE'),'.data-transport']) # # This path can be overriden by an environment variable ... # From de4e065ca69e4027b6d4e002eff729754f5db65e Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 23 Jun 2025 10:25:14 -0500 Subject: [PATCH 260/271] bug fix with newer setuptools --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 503a2d6..119baab 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,9 @@ args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" args['scripts'] = ['bin/transport'] args['classifiers'] = ['Programming Language :: Python :: 3', 'License :: OSI Approved :: MIT License', - 'Operating System :: OS Independent', - ], + "Topic :: Utilities", + 'Operating System :: OS Independent' + ] # if sys.version_info[0] == 2 : # args['use_2to3'] = True # args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] From 66d881fdda720119d0fa94029c66fc926e66322b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 30 Jun 2025 15:16:30 -0500 Subject: [PATCH 261/271] upgrade pyproject.toml, bug fix with registry --- info/__init__.py | 2 +- setup.py | 33 --------- transport/registry.py | 163 ++---------------------------------------- 3 files changed, 7 insertions(+), 191 deletions(-) delete mode 100644 setup.py diff --git a/info/__init__.py b/info/__init__.py index ef03a5b..501211f 100644 --- a/info/__init__.py +++ b/info/__init__.py @@ -1,6 +1,6 @@ __app_name__ = 'data-transport' __author__ = 'The Phi Technology' -__version__= '2.2.18' +__version__= '2.2.22' __email__ = "info@the-phi.com" __edition__= 'community' __license__=f""" diff --git a/setup.py b/setup.py deleted file mode 100644 index 119baab..0000000 --- a/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -This is a build file for the -""" -from setuptools import setup, find_packages -import os -import sys -# from version import __version__,__author__ -from info import __version__, __author__,__app_name__,__license__,__edition__ - - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = { - "name":__app_name__, - "version":__version__, - "author":__author__,"author_email":"info@the-phi.com", - "license":__license__, - # "packages":["transport","info","transport/sql"]}, - - "packages": find_packages(include=['info','transport', 'transport.*'])} -args["keywords"]=['mongodb','duckdb','couchdb','rabbitmq','file','read','write','s3','sqlite'] -args["install_requires"] = ['pyncclient','duckdb-engine','pymongo','sqlalchemy','pandas','typer','pandas-gbq','numpy','cloudant','pika','nzpy','termcolor','boto3','boto','pyarrow','google-cloud-bigquery','google-cloud-bigquery-storage','flask-session','smart_open','botocore','psycopg2-binary','mysql-connector-python','numpy','pymssql','pyspark','pydrill','sqlalchemy_drill','plugin-ix@git+https://github.com/lnyemba/plugins-ix'] -args["url"] = "https://healthcareio.the-phi.com/git/code/transport.git" -args['scripts'] = ['bin/transport'] -args['classifiers'] = ['Programming Language :: Python :: 3', - 'License :: OSI Approved :: MIT License', - "Topic :: Utilities", - 'Operating System :: OS Independent' - ] -# if sys.version_info[0] == 2 : -# args['use_2to3'] = True -# args['use_2to3_exclude_fixers']=['lib2to3.fixes.fix_import'] -setup(**args) diff --git a/transport/registry.py b/transport/registry.py index 8853069..1c9443e 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -6,7 +6,7 @@ import transport import importlib 
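The registry location above can also be redirected wholesale through the DATA_TRANSPORT_REGISTRY_PATH environment variable mentioned in the surrounding comment, which is useful for shared or containerised environments. Because the path is resolved at import time, the variable has to be set before transport is imported; the path below is a placeholder:

    import os
    os.environ['DATA_TRANSPORT_REGISTRY_PATH'] = '/opt/shared/.data-transport'

    import transport
    from transport import registry
    print(registry.REGISTRY_PATH)    # /opt/shared/.data-transport
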
import importlib.util import shutil - +from io import StringIO """ This class manages data from the registry and allows (read only) @@ -21,161 +21,7 @@ if 'DATA_TRANSPORT_REGISTRY_PATH' in os.environ : REGISTRY_PATH = os.environ['DATA_TRANSPORT_REGISTRY_PATH'] REGISTRY_FILE= 'transport-registry.json' DATA = {} -# class plugins: -# # -# # This is a utility function that should enable management of plugins-registry -# # The class allows to add/remove elements -# # -# # @TODO: add read/write properties to the class (better design practice) -# # -# _data = {} -# FOLDER = os.sep.join([REGISTRY_PATH,'plugins']) -# CODE = os.sep.join([REGISTRY_PATH,'plugins','code']) -# FILE = os.sep.join([REGISTRY_PATH,'plugin-registry.json']) -# @staticmethod -# def init(): - -# if not os.path.exists(plugins.FOLDER) : -# os.makedirs(plugins.FOLDER) -# if not os.path.exists(plugins.CODE): -# os.makedirs(plugins.CODE) -# if not os.path.exists(plugins.FILE): -# f = open(plugins.FILE,'w') -# f.write("{}") -# f.close() -# plugins._read() #-- will load data as a side effect - -# @staticmethod -# def copy (path) : - -# shutil.copy2(path,plugins.CODE) -# @staticmethod -# def _read (): -# f = open(plugins.FILE) -# try: -# _data = json.loads(f.read()) -# f.close() -# except Exception as e: -# print (f"Corrupted registry, resetting ...") -# _data = {} -# plugins._write(_data) - -# plugins._data = _data -# @staticmethod -# def _write (_data): -# f = open(plugins.FILE,'w') -# f.write(json.dumps(_data)) -# f.close() -# plugins._data = _data - -# @staticmethod -# def inspect (_path): -# _names = [] - -# if os.path.exists(_path) : -# _filename = _path.split(os.sep)[-1] -# spec = importlib.util.spec_from_file_location(_filename, _path) -# module = importlib.util.module_from_spec(spec) -# spec.loader.exec_module(module) -# # _names = [{'name':getattr(getattr(module,_name),'name'),'pointer':getattr(module,_name)} for _name in dir(module) if type( getattr(module,_name)).__name__ == 'function'] -# for _name in dir(module) : -# _pointer = getattr(module,_name) -# if hasattr(_pointer,'transport') : -# _item = {'real_name':_name,'name':getattr(_pointer,'name'),'pointer':_pointer,'version':getattr(_pointer,'version')} -# _names.append(_item) - - -# return _names -# @staticmethod -# def add (alias,path): -# """ -# Add overwrite the registry entries -# """ -# _names = plugins.inspect (path) -# _log = [] - -# if _names : -# # -# # We should make sure we have all the plugins with the attributes (transport,name) set -# _names = [_item for _item in _names if hasattr(_item['pointer'],'transport') ] -# if _names : -# plugins.copy(path) -# _content = [] - -# for _item in _names : -# _key = '@'.join([alias,_item['name']]) -# _log.append(_item['name']) -# # -# # Let us update the registry -# # -# plugins.update(alias,path,_log) -# return _log - -# @staticmethod -# def update (alias,path,_log) : -# """ -# updating the registry entries of the plugins (management data) -# """ -# # f = open(plugins.FILE) -# # _data = json.loads(f.read()) -# # f.close() -# _data = plugins._data -# # _log = plugins.add(alias,path) - -# if _log : -# _data[alias] = {'content':_log,'name':path.split(os.sep)[-1]} -# plugins._write(_data) #-- will update data as a side effect - -# return _log -# @staticmethod -# def get(**_args) : -# # f = open(plugins.FILE) -# # _data = json.loads(f.read()) -# # f.close() -# # if 'key' in _args : -# # alias,name = _args['key'].split('.') if '.' 
in _args['key'] else _args['key'].split('@') -# # else : -# # alias = _args['alias'] -# # name = _args['name'] - -# # if alias in _data : - -# # _path = os.sep.join([plugins.CODE,_data[alias]['name']]) -# # _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] - -# # _item = _item[0] if _item else None -# # if _item : - -# # return _item['pointer'] -# # return None -# _item = plugins.has(**_args) -# return _item['pointer'] if _item else None - -# @staticmethod -# def has (**_args): -# f = open(plugins.FILE) -# _data = json.loads(f.read()) -# f.close() -# if 'key' in _args : -# alias,name = _args['key'].split('.') if '.' in _args['key'] else _args['key'].split('@') -# else : -# alias = _args['alias'] -# name = _args['name'] - -# if alias in _data : - -# _path = os.sep.join([plugins.CODE,_data[alias]['name']]) -# _item = [_item for _item in plugins.inspect(_path) if name == _item['name']] - -# _item = _item[0] if _item else None -# if _item : - -# return copy.copy(_item) -# return None -# @staticmethod -# def synch(): -# pass def isloaded (): return DATA not in [{},None] @@ -233,8 +79,11 @@ def set (label, auth_file, default=False,path=REGISTRY_PATH) : if label == 'default' : raise Exception ("""Invalid label name provided, please change the label name and use the switch""") reg_file = os.sep.join([path,REGISTRY_FILE]) - if os.path.exists (auth_file) and os.path.exists(path) and os.path.exists(reg_file): - f = open(auth_file) + if os.path.exists(path) and os.path.exists(reg_file): + if type(auth_file) == str and os.path.exists (auth_file) : + f = open(auth_file) + elif type(auth_file) == StringIO: + f = auth_file _info = json.loads(f.read()) f.close() f = open(reg_file) From 6e1c4209529c339aba0d20e565d517e981d770d5 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 30 Jun 2025 15:51:50 -0500 Subject: [PATCH 262/271] project file specification --- pyproject.toml | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..b61e7e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,62 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "data-transport" +dynamic = ["version"] +authors = [ + {name="Steve L. 
Nyemba" , email = "info@the-phi.com"}, +] +description = "" +readme = "README.md" +license = {text = "LICENSE"} +keywords = ["mongodb","duckdb","couchdb","rabbitmq","file","read","write","s3","sqlite"] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Topic :: Utilities", +] +dependencies = [ + "termcolor","sqlalchemy", "aiosqlite","duckdb-engine", + "typer","pandas","numpy","sqlalchemy","pyarrow", + "plugin-ix@git+https://github.com/lnyemba/plugins-ix" +] +[project.optional-dependencies] +sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] +nosql = ["pymongo","cloudant"] +cloud = ["pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] +warehouse = ["pydrill","pyspark","sqlalchemy_drill"] +rabbitmq = ["pika"] +sqlite = ["aiosqlite"] +aws3 = ["boto3","boto","botocore"] +nextcloud = ["pyncclient"] +mongodb = ["pymongo"] +netezza = ["nzpy"] +mysql = ["mysql-connector-python"] +postgresql = ["psycopg2-binary"] +sqlserver = ["pymssql"] +http = ["flask-session"] +all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"] + +[project.urls] +Homepage = "https://healthcareio.the-phi.com/git/code/transport.git" + +#[project.scripts] +#transport = "transport:main" + +[tool.setuptools] +include-package-data = true +zip-safe = false +script-files = ["bin/transport"] + +[tool.setuptools.packages.find] +include = ["info","info.*", "transport", "transport.*"] + +[tool.setuptools.dynamic] +version = {attr = "info.__version__"} +#authors = {attr = "meta.__author__"} + +# If you have a info.py file, you might also want to include the author dynamically: +# [tool.setuptools.dynamic] +# version = {attr = "info.__version__"} +# authors = {attr = "info.__author__"} From fbdb4a493135f8a91275b98013d04f4207f87e7b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 4 Jul 2025 16:54:52 -0500 Subject: [PATCH 263/271] bug fix: registry and emails --- transport/registry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/transport/registry.py b/transport/registry.py index 1c9443e..196b2f0 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -12,8 +12,11 @@ from io import StringIO This class manages data from the registry and allows (read only) @TODO: add property to the DATA attribute """ +if 'HOME' in os.environ : + REGISTRY_PATH=os.sep.join([os.environ['HOME'],'.data-transport']) +else: + REGISTRY_PATH=os.sep.join([os.environ['USERPROFILE'],'.data-transport']) -REGISTRY_PATH=os.sep.join([os.environ.get('HOME','USERPROFILE'),'.data-transport']) # # This path can be overriden by an environment variable ... 
# From befdf453f502ad212eef80155d70919f34990894 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Fri, 4 Jul 2025 16:57:30 -0500 Subject: [PATCH 264/271] bug fix: crash with etl & process --- bin/transport | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/transport b/bin/transport index 19b664e..41c1a75 100755 --- a/bin/transport +++ b/bin/transport @@ -53,9 +53,9 @@ def wait(jobs): while jobs : jobs = [thread for thread in jobs if thread.is_alive()] time.sleep(1) -def wait (jobs): - while jobs : - jobs = [pthread for pthread in jobs if pthread.is_alive()] +# def wait (jobs): +# while jobs : +# jobs = [pthread for pthread in jobs if pthread.is_alive()] @app_e.command(name="run") def apply (path:Annotated[str,typer.Argument(help="path of the configuration file")], From be10ae17d78154e87ac59c81bb9950562cc44d56 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 10:09:43 -0500 Subject: [PATCH 265/271] bug fixes: installer & registry --- pyproject.toml | 1 + transport/registry.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b61e7e5..159e9cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ classifiers = [ ] dependencies = [ "termcolor","sqlalchemy", "aiosqlite","duckdb-engine", + "mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite", "typer","pandas","numpy","sqlalchemy","pyarrow", "plugin-ix@git+https://github.com/lnyemba/plugins-ix" ] diff --git a/transport/registry.py b/transport/registry.py index 196b2f0..050b82d 100644 --- a/transport/registry.py +++ b/transport/registry.py @@ -49,7 +49,8 @@ def init (email,path=REGISTRY_PATH,override=False,_file=REGISTRY_FILE): Initializing the registry and will raise an exception in the advent of an issue """ p = '@' in email - q = False if '.' not in email else email.split('.')[-1] in ['edu','com','io','ai','org'] + #q = False if '.' 
not in email else email.split('.')[-1] in ['edu','com','io','ai','org'] + q = len(email.split('.')[-1]) in [2,3] if p and q : _config = {"email":email,'version':__version__} if not os.path.exists(path): From f06d26f9b676332136f03fdf9891962b52e61a28 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 11:46:27 -0500 Subject: [PATCH 266/271] bug fixes:installer & imports --- pyproject.toml | 15 +++------------ transport/__init__.py | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 159e9cb..c0d8a4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,24 +19,15 @@ classifiers = [ dependencies = [ "termcolor","sqlalchemy", "aiosqlite","duckdb-engine", "mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite", - "typer","pandas","numpy","sqlalchemy","pyarrow", + "typer","pandas","numpy","sqlalchemy","pyarrow","smart-open", "plugin-ix@git+https://github.com/lnyemba/plugins-ix" ] [project.optional-dependencies] sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] nosql = ["pymongo","cloudant"] -cloud = ["pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] +cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] warehouse = ["pydrill","pyspark","sqlalchemy_drill"] -rabbitmq = ["pika"] -sqlite = ["aiosqlite"] -aws3 = ["boto3","boto","botocore"] -nextcloud = ["pyncclient"] -mongodb = ["pymongo"] -netezza = ["nzpy"] -mysql = ["mysql-connector-python"] -postgresql = ["psycopg2-binary"] -sqlserver = ["pymssql"] -http = ["flask-session"] +other = ["pika","flask-session"] all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"] [project.urls] diff --git a/transport/__init__.py b/transport/__init__.py index c3bb901..bcc8904 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -18,7 +18,27 @@ Source Code is available under MIT License: """ import numpy as np -from transport import sql, nosql, cloud, other, warehouse +#from transport import sql, nosql, cloud, other, warehouse +from transport import sql +try: + from transport import nosql +finally: + pass +try: + from transport import cloud +finally: + pass +try: + from transport import warehouse +finally: + pass +try: + from transport import other +finally: + pass + + + import pandas as pd import json import os From 18c54d7664c4cdeeff4c8432e529e049dbaa052c Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 12:02:38 -0500 Subject: [PATCH 267/271] bug fixes --- transport/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index bcc8904..a96b4f7 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -22,19 +22,19 @@ import numpy as np from transport import sql try: from transport import nosql -finally: +except Exception as e: pass try: from transport import cloud -finally: +except Exception as e: pass try: from transport import warehouse -finally: +except Exception as e: pass try: from 
transport import other -finally: +except Exception as e: pass From 6e753a1fcd8d704e3392f92680e4e5eae2d13779 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 12:03:45 -0500 Subject: [PATCH 268/271] bug fixes --- transport/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transport/__init__.py b/transport/__init__.py index a96b4f7..583b9d8 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -23,19 +23,19 @@ from transport import sql try: from transport import nosql except Exception as e: - pass + nosql = {} try: from transport import cloud except Exception as e: - pass + cloud = {} try: from transport import warehouse except Exception as e: - pass + warehouse = {} try: from transport import other except Exception as e: - pass + other = {} From 89d762f39ab1c12cf0cc4f7c69a86448b151413b Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 12:14:10 -0500 Subject: [PATCH 269/271] bug fixes: conditional imports --- transport/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transport/__init__.py b/transport/__init__.py index 583b9d8..6937189 100644 --- a/transport/__init__.py +++ b/transport/__init__.py @@ -55,7 +55,7 @@ def init(): global PROVIDERS for _module in [cloud,sql,nosql,other,warehouse] : for _provider_name in dir(_module) : - if _provider_name.startswith('__') or _provider_name == 'common': + if _provider_name.startswith('__') or _provider_name == 'common' or type(_module) in [None,str,dict]: continue PROVIDERS[_provider_name] = {'module':getattr(_module,_provider_name),'type':_module.__name__} # From a31481e19612a5b90400cee4d1544a8a63eb1ebf Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 8 Jul 2025 14:11:07 -0500 Subject: [PATCH 270/271] fix --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c0d8a4f..742915d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,12 +23,12 @@ dependencies = [ "plugin-ix@git+https://github.com/lnyemba/plugins-ix" ] [project.optional-dependencies] -sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] +#sql = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite"] nosql = ["pymongo","cloudant"] cloud = ["boto","boto3","botocore","pyncclient","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore"] warehouse = ["pydrill","pyspark","sqlalchemy_drill"] other = ["pika","flask-session"] -all = ["mysql-connector-python","psycopg2-binary","nzpy","pymssql","duckdb-engine","aiosqlite","pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"] +all = ["pymongo","cloudant","pandas-gbq","google-cloud-bigquery","google-cloud-bigquery-storage", "databricks-sqlalchemy","pyncclient","boto3","boto","botocore","pydrill","pyspark","sqlalchemy_drill", "pika","aiosqlite","boto3","boto","botocore", "pyncclient"] [project.urls] Homepage = "https://healthcareio.the-phi.com/git/code/transport.git" From 4c2efc28924b543306768dda240cbde6b7eae034 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Mon, 21 Jul 2025 13:10:50 -0500 Subject: [PATCH 271/271] documentation ... 
readme

---
 README.md | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 7d8b414..577350e 100644
--- a/README.md
+++ b/README.md
@@ -4,12 +4,11 @@ This project implements an abstraction of objects that can have access to a vari
 
 # Why Use Data-Transport ?
 
-Mostly data scientists that don't really care about the underlying database and would like a simple and consistent way to read/write and move data are well served. Additionally we implemented lightweight Extract Transform Loading API and command line (CLI) tool. Finally it is possible to add pre/post processing pipeline functions to read/write
-
-1. Familiarity with **pandas data-frames**
-2. Connectivity **drivers** are included
-3. Reading/Writing data from various sources
-4. Useful for data migrations or **ETL**
+Data transport is a simple framework that:
+- is easy to install & modify (open-source)
+- provides access to multiple database technologies (pandas, SQLAlchemy)
+- enables notebook sharing without exposing database credentials
+- supports pre/post-processing pipeline specifications
 
 ## Installation
 
@@ -18,19 +17,16 @@ Within the virtual environment perform the following :
 
     pip install git+https://github.com/lnyemba/data-transport.git
 
-## Features
+Optional components can be installed by listing them in square brackets :
 
- - read/write from over a dozen databases
- - run ETL jobs seamlessly
- - scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
+    pip install data-transport[nosql,cloud,warehouse,all]@git+https://github.com/lnyemba/data-transport.git
 
-## What's new
+## Additional features
 
-Unlike older versions 2.0 and under, we focus on collaborative environments like jupyter-x servers; apache zeppelin:
- 1. Simpler syntax to create reader or writer
- 2. auth-file registry that can be referenced using a label
- 3. duckdb support
+ - In addition to read/write, pre/post-processing functions are supported
+ - CLI interface to manage the auth-file registry and run ETL jobs
+ - scales and integrates into shared environments like apache zeppelin; jupyterhub; SageMaker; ...
 
 ## Learn More
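
A minimal sketch of the auth-file registry workflow touched by PATCH 259-265, assuming only the function names visible in transport/registry.py above (registry.init, registry.set) and the DATA_TRANSPORT_REGISTRY_PATH override; the label, file path, and credential fields are hypothetical placeholders, and the exact auth-file schema depends on the provider being registered.

    # Sketch only: not part of the patch series above.
    # transport/registry.py reads DATA_TRANSPORT_REGISTRY_PATH at import time,
    # so the override (optional) has to be set before importing transport.
    import os
    import json
    from io import StringIO

    os.environ['DATA_TRANSPORT_REGISTRY_PATH'] = '/tmp/.data-transport'   # optional override

    from transport import registry

    # One-time initialization of ~/.data-transport (or the override above);
    # registry.init raises if the e-mail address fails its basic checks.
    registry.init('data.scientist@example.com')

    # Register credentials under a label, either from a file on disk ...
    registry.set('mypg', '/path/to/postgresql-auth.json')    # hypothetical path

    # ... or, as of PATCH 261, from an in-memory StringIO object.
    _auth = StringIO(json.dumps({'provider': 'postgresql', 'database': 'demo'}))  # illustrative fields
    registry.set('inmem', _auth)

Once a label is registered this way, notebooks can refer to the label instead of embedding connection details, which is what the "notebook sharing without exposing database credentials" bullet in the README refers to.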