bug fix: ETL logging and rabbitmq-server listener

pull/1/head
Steve Nyemba 2 years ago
parent 67cb7de861
commit 8cd34d902a

@ -68,11 +68,13 @@ class factory :
"mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my}, "mariadb":{"port":3306,"host":"localhost","default":{"type":"VARCHAR(256)"},"driver":my},
"mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}}, "mongo":{"port":27017,"host":"localhost","class":{"read":mongo.MongoReader,"write":mongo.MongoWriter}},
"couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}}, "couch":{"port":5984,"host":"localhost","class":{"read":couch.CouchReader,"write":couch.CouchWriter}},
"netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}}} "netezza":{"port":5480,"driver":nz,"default":{"type":"VARCHAR(256)"}},
"rabbitmq":{"port":5672,"host":"localhost","class":{"read":queue.QueueReader,"write":queue.QueueWriter,"listen":queue.QueueListener},"default":{"type":"application/json"}}}
# #
# creating synonyms # creating synonyms
PROVIDERS['mongodb'] = PROVIDERS['mongo'] PROVIDERS['mongodb'] = PROVIDERS['mongo']
PROVIDERS['couchdb'] = PROVIDERS['couch'] PROVIDERS['couchdb'] = PROVIDERS['couch']
PROVIDERS['bq'] = PROVIDERS['bigquery']
PROVIDERS['sqlite3'] = PROVIDERS['sqlite'] PROVIDERS['sqlite3'] = PROVIDERS['sqlite']
@staticmethod @staticmethod
@ -124,7 +126,7 @@ def instance(**_args):
provider = _args['provider'] provider = _args['provider']
context = _args['context']if 'context' in _args else None context = _args['context']if 'context' in _args else None
_id = context if context in ['read','write'] else 'read' _id = context if context in list(factory.PROVIDERS[provider]['class'].keys()) else 'read'
if _id : if _id :
args = {'provider':_id} args = {'provider':_id}
for key in factory.PROVIDERS[provider] : for key in factory.PROVIDERS[provider] :
@ -147,7 +149,7 @@ def instance(**_args):
try: try:
host = '' host = ''
if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file'] : if provider not in ['bigquery','mongodb','couchdb','sqlite','console','etl','file','rabbitmq'] :
# #
# In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery # In these cases we are assuming RDBMS and thus would exclude NoSQL and BigQuery
username = args['username'] if 'username' in args else '' username = args['username'] if 'username' in args else ''
@ -165,7 +167,7 @@ def instance(**_args):
account = '' account = ''
host = '' host = ''
database = args['path'] if 'path' in args else args['database'] database = args['path'] if 'path' in args else args['database']
if provider not in ['mongodb','couchdb','bigquery','console','etl','file'] : if provider not in ['mongodb','couchdb','bigquery','console','etl','file','rabbitmq'] :
uri = ''.join([provider,"://",account,host,'/',database]) uri = ''.join([provider,"://",account,host,'/',database])
e = sqlalchemy.create_engine (uri,future=True) e = sqlalchemy.create_engine (uri,future=True)

@ -98,15 +98,15 @@ class Console(Writer):
self.debug = self.write self.debug = self.write
self.log = self.write self.log = self.write
pass pass
def write (self,info,**_args): def write (self,**_args):
if self.lock : if self.lock :
Console.lock.acquire() Console.lock.acquire()
try: try:
if type(info) == list: if type(_args) == list:
for row in info : for row in _args :
print (row) print (row)
else: else:
print (info) print (_args)
except Exception as e : except Exception as e :
print (e) print (e)
finally: finally:

@ -54,41 +54,46 @@ if len(sys.argv) > 1:
i += 2 i += 2
class Post(Process): class Post(Process):
def __init__(self,**args): def __init__(self,**args):
super().__init__() super().__init__()
if 'provider' not in args['target'] : if 'provider' not in args['target'] :
self.PROVIDER = args['target']['type'] self.PROVIDER = args['target']['type']
self.writer = transport.factory.instance(**args['target']) self.writer = transport.factory.instance(**args['target'])
else: else:
self.PROVIDER = args['target']['provider'] self.PROVIDER = args['target']['provider']
args['target']['context'] = 'write' args['target']['context'] = 'write'
self.store = args['target'] self.store = args['target']
# self.writer = transport.instance(**args['target']) self.store['lock'] = True
# # self.writer = transport.instance(**args['target'])
# If the table doesn't exist maybe create it ? #
# # If the table doesn't exist maybe create it ?
self.rows = args['rows'].fillna('') #
self.rows = args['rows'].fillna('')
def run(self): def log(self,**_args) :
_info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows if ETL.logger :
ltypes = self.rows.dtypes.values ETL.logger.info(**_args)
columns = self.rows.dtypes.index.tolist()
# if not self.writer.has() : def run(self):
_info = {"values":self.rows} if 'couch' in self.PROVIDER else self.rows
ltypes = self.rows.dtypes.values
# self.writer.make(fields=columns) columns = self.rows.dtypes.index.tolist()
# ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table}) # if not self.writer.has() :
for name in columns :
if _info[name].dtype in ['int32','int64','int','float','float32','float64'] :
value = 0 # self.writer.make(fields=columns)
else: # ETL.logger.info(module='write',action='make-table',input={"name":self.writer.table})
value = '' self.log(module='write',action='make-table',input={"schema":columns})
_info[name] = _info[name].fillna(value) for name in columns :
writer = transport.factory.instance(**self.store) if _info[name].dtype in ['int32','int64','int','float','float32','float64'] :
writer.write(_info) value = 0
writer.close() else:
value = ''
_info[name] = _info[name].fillna(value)
writer = transport.factory.instance(**self.store)
writer.write(_info)
writer.close()
class ETL (Process): class ETL (Process):
@ -115,8 +120,9 @@ class ETL (Process):
self.jobs = [] self.jobs = []
# self.logger = transport.factory.instance(**_args['logger']) # self.logger = transport.factory.instance(**_args['logger'])
def log(self,**_args) : def log(self,**_args) :
_args['name'] = self.name if ETL.logger :
print (_args) ETL.logger.info(**_args)
def run(self): def run(self):
if self.cmd : if self.cmd :
idf = self.reader.read(**self.cmd) idf = self.reader.read(**self.cmd)
@ -126,7 +132,7 @@ class ETL (Process):
# idf = idf.replace({np.nan: None}, inplace = True) # idf = idf.replace({np.nan: None}, inplace = True)
idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()] idf.columns = [str(name).replace("b'",'').replace("'","").strip() for name in idf.columns.tolist()]
# ETL.logger.info(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT) self.log(rows=idf.shape[0],cols=idf.shape[1],jobs=self.JOB_COUNT)
# #
# writing the data to a designated data source # writing the data to a designated data source
@ -134,7 +140,7 @@ class ETL (Process):
try: try:
# ETL.logger.info(module='write',action='partitioning') self.log(module='write',action='partitioning',jobs=self.JOB_COUNT)
rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT) rows = np.array_split(np.arange(0,idf.shape[0]),self.JOB_COUNT)
# #
@ -149,9 +155,9 @@ class ETL (Process):
self.jobs.append(proc) self.jobs.append(proc)
proc.start() proc.start()
# ETL.logger.info(module='write',action='working',segment=str(id),table=self.name,rows=segment.shape[0]) self.log(module='write',action='working',segment=str(self.name),table=self.name,rows=segment.shape[0])
# while poc : # while self.jobs :
# proc = [job for job in proc if job.is_alive()] # jobs = [job for job in proc if job.is_alive()]
# time.sleep(1) # time.sleep(1)
except Exception as e: except Exception as e:
print (e) print (e)
@ -166,9 +172,9 @@ def instance(**_args):
""" """
logger = _args['logger'] if 'logger' in _args else None logger = _args['logger'] if 'logger' in _args else None
_info = _args['info'] _info = _args['info']
if logger : if logger and type(logger) != str:
ETL.logger = logger ETL.logger = logger
else: elif logger == 'console':
ETL.logger = transport.factory.instance(provider='console',lock=True) ETL.logger = transport.factory.instance(provider='console',lock=True)
if type(_info) in [list,dict] : if type(_info) in [list,dict] :
_config = _info if type(_info) != dict else [_info] _config = _info if type(_info) != dict else [_info]
@ -195,8 +201,6 @@ if __name__ == '__main__' :
_config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}} _config['source'] = {"type":"disk.DiskReader","args":{"path":SYS_ARGS['source'],"delimiter":","}}
_config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs']) _config['jobs'] = 3 if 'jobs' not in SYS_ARGS else int(SYS_ARGS['jobs'])
print (_config)
print ()
etl = ETL (**_config) etl = ETL (**_config)
if index is None: if index is None:

@ -222,22 +222,21 @@ class QueueListener(MessageQueue):
def __init__(self,**args): def __init__(self,**args):
MessageQueue.__init__(self,**args) MessageQueue.__init__(self,**args)
self.listen = self.read self.listen = self.read
# def init(self,qid): self.apply = args['apply'] if 'apply' in args else print
# properties = pika.ConnectionParameters(host=self.host)
# self.connection = pika.BlockingConnection(properties)
# self.channel = self.connection.channel()
# self.channel.exchange_declare(exchange=self.exchange,type='direct',durable=True )
# self.info = self.channel.queue_declare(passive=True,exclusive=True,queue=qid)
# self.channel.queue_bind(exchange=self.exchange,queue=self.info.method.queue,routing_key=qid)
#self.callback = callback
def finalize(self,channel,ExceptionReason): def finalize(self,channel,ExceptionReason):
pass pass
def callback(self,channel,method,header,stream) : def callback(self,channel,method,header,stream) :
raise Exception("....") _info= {}
# if re.match("^\{|\[",stream) is not None:
if stream.startswith(b"[") or stream.startswith(b"{"):
_info = json.loads(stream)
else:
_info = stream
self.apply(_info)
def read(self): def read(self):
self.init(self.queue) self.init(self.queue)

@ -312,9 +312,11 @@ class BigQuery:
:param sql sql query to be pulled, :param sql sql query to be pulled,
""" """
table = _args['table'] table = _args['table']
try:
ref = self.client.dataset(self.dataset).table(table) ref = self.client.dataset(self.dataset).table(table)
return self.client.get_table(ref).schema return self.client.get_table(ref).schema
except Exception as e:
return []
def has(self,**_args): def has(self,**_args):
found = False found = False
try: try:

Loading…
Cancel
Save