bug fixes for version 1.0.8, streamlining interface

pull/1/head
Steve L. Nyemba 5 years ago
parent aaad4003a9
commit 081ed080d7

@ -6,7 +6,12 @@ import os
import sys
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-transport","version":"1.0.0","author":"The Phi Technology LLC","author_email":"info@the-phi.com","license":"MIT","packages":["transport"]}
args = {
"name":"data-transport",
"version":"1.0.8",
"author":"The Phi Technology LLC","author_email":"info@the-phi.com",
"license":"MIT",
"packages":["transport"]}
args["keywords"]=['mongodb','couchdb','rabbitmq','file','read','write','s3']
args["install_requires"] = ['pymongo','numpy','cloudant','pika','boto','flask-session','smart_open']
args["url"] = "https://dev.the-phi.com/git/steve/data-transport.git"

@ -14,7 +14,8 @@ Requirements :
pymongo
boto
couldant
@TODO:
Enable read/writing to multiple reads/writes
"""
__author__ = 'The Phi Technology'
import numpy as np
@ -22,107 +23,72 @@ import json
import importlib
# import couch
# import mongo
class Reader:
def __init__(self):
self.nrows = 0
self.xchar = None
def row_count(self):
content = self.read()
return np.sum([1 for row in content])
def delimiter(self,sample):
class IO:
def init(self,**args):
"""
This function determines the most common delimiter from a subset of possible delimiters.
It uses a statistical approach (distribution) to guage the distribution of columns for a given delimiter
:sample sample string/content expecting matrix i.e list of rows
This function enables attributes to be changed at runtime. Only the attributes defined in the class can be changed
Adding attributes will require sub-classing otherwise we may have an unpredictable class ...
"""
m = {',':[],'\t':[],'|':[],'\x3A':[]}
delim = list(m.keys())
for row in sample:
for xchar in delim:
if row.split(xchar) > 1:
m[xchar].append(len(row.split(xchar)))
else:
m[xchar].append(0)
#
# The delimiter with the smallest variance, provided the mean is greater than 1
# This would be troublesome if there many broken records sampled
#
m = {id: np.var(m[id]) for id in list(m.keys()) if m[id] != [] and int(np.mean(m[id]))>1}
index = list(m.values()).index( min(m.values()))
xchar = list(m.keys())[index]
return xchar
def col_count(self,sample):
"""
This function retirms the number of columns of a given sample
@pre self.xchar is not None
allowed = list(vars(self).keys())
for field in args :
if field not in allowed :
continue
value = args[field]
setattr(self,field,value)
class Reader (IO):
"""
This class is an abstraction of a read functionalities of a data store
"""
def __init__(self):
pass
def meta(self):
"""
m = {}
i = 0
for row in sample:
row = self.format(row)
id = str(len(row))
#id = str(len(row.split(self.xchar)))
if id not in m:
m[id] = 0
m[id] = m[id] + 1
index = list(m.values()).index( max(m.values()) )
ncols = int(list(m.keys())[index])
return ncols;
def format (self,row):
This function is intended to return meta-data associated with what has just been read
@return object of meta data information associated with the content of the store
"""
This function will clean records of a given row by removing non-ascii characters
@pre self.xchar is not None
raise Exception ("meta function needs to be implemented")
def read(**args):
"""
if isinstance(row,list) == False:
#
# We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary)
cols = self.split(row)
#cols = row.split(self.xchar)
else:
cols = row ;
return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols]
def split (self,row):
This function is intended to read the content of a store provided parameters to be used at the discretion of the subclass
"""
This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes.
@pre : self.xchar is not None
"""
pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"])
return re.findall(pattern,row.replace('\n',''))
raise Exception ("read function needs to be implemented")
class Writer:
class Writer(IO):
def __init__(self):
self.cache = {"default":[]}
def log(self,**args):
self.cache[id] = args
def meta (self,id="default",**args):
raise Exception ("meta function needs to be implemented")
def format(self,row,xchar):
if xchar is not None and isinstance(row,list):
return xchar.join(row)+'\n'
elif xchar is None and isinstance(row,dict):
row = json.dumps(row)
return row
"""
def write(self,**args):
"""
This function will write content to a store given parameters to be used at the discretion of the sub-class
"""
raise Exception ("write function needs to be implemented")
def archive(self):
"""
It is important to be able to archive data so as to insure that growth is controlled
Nothing in nature grows indefinitely neither should data being handled.
"""
def archive(self):
pass
def flush(self):
"""
raise Exception ("archive function needs to be implemented")
def close(self):
"""
This function will close the persistent storage connection/handler
"""
pass
class ReadWriter(Reader,Writer) :
"""
This class implements the read/write functions aggregated
"""
pass
# class factory :
# @staticmethod
# def instance(**args):

@ -15,13 +15,13 @@ else:
class Couch:
"""
This class is a wrapper for read/write against couchdb. The class captures common operations for read/write.
@param url host & port reference
@param url host & port reference default http://localhost:5984
@param doc user id involved
@param dbname database name (target)
"""
def __init__(self,**args):
url = args['url']
self.uid = args['doc']
url = args['url'] if 'url' in args else 'http://localhost:5984'
self._id = args['doc']
dbname = args['dbname']
if 'username' not in args and 'password' not in args :
self.server = cloudant.CouchDB(None,None,url=url)
@ -34,9 +34,9 @@ class Couch:
#
# @TODO Check if the database exists ...
#
doc = cloudant.document.Document(self.dbase,self.uid) #self.dbase.get(self.uid)
doc = cloudant.document.Document(self.dbase,self._id) #self.dbase.get(self._id)
if not doc.exists():
doc = self.dbase.create_document({"_id":self.uid})
doc = self.dbase.create_document({"_id":self._id})
doc.save()
else:
self.dbase = None
@ -51,8 +51,8 @@ class Couch:
# At this point we are sure that the server is connected
# We are also sure that the database actually exists
#
doc = cloudant.document.Document(self.dbase,self.uid)
# q = self.dbase.all_docs(key=self.uid)['rows']
doc = cloudant.document.Document(self.dbase,self._id)
# q = self.dbase.all_docs(key=self._id)['rows']
# if not q :
if not doc.exists():
return False
@ -107,7 +107,7 @@ class CouchReader(Couch,Reader):
# # We insure the document of the given user has the requested attachment.
# #
# doc = self.dbase.get(self.uid)
# doc = self.dbase.get(self._id)
# if '_attachments' in doc:
# r = self.filename in doc['_attachments'].keys()
@ -120,8 +120,8 @@ class CouchReader(Couch,Reader):
#
# @TODO Need to get this working ...
#
document = cloudant.document.Document(self.dbase,self.uid)
# content = self.dbase.fetch_attachment(self.uid,self.filename).split('\n') ;
document = cloudant.document.Document(self.dbase,self._id)
# content = self.dbase.fetch_attachment(self._id,self.filename).split('\n') ;
content = self.get_attachment(self.filename)
for row in content:
yield row
@ -132,9 +132,9 @@ class CouchReader(Couch,Reader):
else:
return self.basic_read()
def basic_read(self):
document = cloudant.document.Document(self.dbase,self.uid)
document = cloudant.document.Document(self.dbase,self._id)
# document = self.dbase.get(self.uid)
# document = self.dbase.get(self._id)
if document.exists() :
document.fetch()
document = dict(document)
@ -157,32 +157,62 @@ class CouchWriter(Couch,Writer):
"""
Couch.__init__(self,**args)
def write(self,**params):
def set (self,info):
document = cloudand.document.Document(self.dbase,self._id)
if document.exists() :
keys = list(set(document.keys()) - set(['_id','_rev','_attachments']))
for id in keys :
document.field_set(document,id,None)
for id in args :
value = args[id]
document.field_set(document,id,value)
document.save()
pass
else:
_document = dict({"_id":self._id},**args)
document.create_document(_document)
def write(self,info):
"""
write a given attribute to a document database
@param label scope of the row repair|broken|fixed|stats
@param row row to be written
@info object to be written to the to an attribute. this
"""
# document = self.dbase.get(self.uid)
document = cloudant.document.Document(self.dbase,self.uid) #.get(self.uid)
# document = self.dbase.get(self._id)
document = cloudant.document.Document(self.dbase,self._id) #.get(self._id)
if document.exists() is False :
document = self.dbase.create_document({"_id":self.uid})
label = params['label']
row = params['row']
if label not in document :
document[label] = []
document[label].append(row)
document = self.dbase.create_document({"_id":self._id})
# label = params['label']
# row = params['row']
# if label not in document :
# document[label] = []
# document[label].append(row)
for key in info :
if key in document and type(document[key]) == list :
document[key] += info[key]
else:
document[key] = info[key]
document.save()
# self.dbase.bulk_docs([document])
# self.dbase.save_doc(document)
def upload(self,**args):
"""
:param name name of the file to be uploaded
:param data content of the file (binary or text)
:param content_type (default)
"""
mimetype = args['content_type'] if 'content_type' in args else 'text/plain'
document = cloudant.document.Document(self.dbase,self.uid)
document.put_attachment(self.dbase,args['filename'],mimetype,args['content'])
document.save()
def archive(self,params=None):
"""
This function will archive the document onto itself.
"""
# document = self.dbase.all_docs(self.uid,include_docs=True)
# document = self.dbase.all_docs(self._id,include_docs=True)
document = cloudant.document.Document(self.dbase,self.filename)
document.fetch()
content = {}
@ -196,8 +226,9 @@ class CouchWriter(Couch,Writer):
# document= _doc
now = str(datetime.today())
name = '-'.join([document['_id'] , now,'.json'])
name = '-'.join([document['_id'] , now,'.json'])
self.upload(filename=name,data=content,content_type='application/json')
# self.dbase.bulk_docs([document])
# self.dbase.put_attachment(document,content,name,'application/json')
document.put_attachment(self.dbase,name,'application/json',content)
document.save()
# document.put_attachment(self.dbase,name,'application/json',content)
# document.save()

@ -14,8 +14,8 @@ class DiskReader(Reader) :
"""
Reader.__init__(self)
self.path = params['path'] ;
self.path = params['path'] ;
self.delimiter = params['delimiter'] if 'delimiter' in params else None
def isready(self):
return os.path.exists(self.path)
def read(self,size=-1):
@ -31,55 +31,54 @@ class DiskReader(Reader) :
i += 1
if size == i:
break
if self.delimiter :
yield row.split(self.char)
yield row
f.close()
class DiskWriter(Writer):
"""
This function writes output to disk in a designated location
This function writes output to disk in a designated location. The function will write a text to a text file
- If a delimiter is provided it will use that to generate a xchar-delimited file
- If not then the object will be dumped as is
"""
def __init__(self,**params):
Writer.__init__(self)
self.cache['meta'] = {'cols':0,'rows':0,'delimiter':None}
if 'path' in params:
self.path = params['path']
else:
self.path = None
if 'name' in params:
self.name = params['name'];
else:
self.name = 'out.log'
self.path = 'data-transport.log'
self.delimiter = params['delimiter'] if 'delimiter' in params else None
# if 'name' in params:
# self.name = params['name'];
# else:
# self.name = 'data-transport.log'
# if os.path.exists(self.path) == False:
# os.mkdir(self.path)
def meta(self):
return self.cache['meta']
def isready(self):
"""
This function determines if the class is ready for execution or not
i.e it determines if the preconditions of met prior execution
"""
p = self.path is not None and os.path.exists(self.path)
q = self.name is not None
return p and q
def write(self,**params):
return True
# p = self.path is not None and os.path.exists(self.path)
# q = self.name is not None
# return p and q
def format (self,row):
self.cache['meta']['cols'] += len(row) if isinstance(row,list) else len(row.keys())
self.cache['meta']['rows'] += 1
return (self.delimiter.join(row) if self.delimiter else json.dumps(row))+"\n"
def write(self,info):
"""
This function writes a record to a designated file
@param label <passed|broken|fixed|stats>
@param row row to be written
"""
# label = params['label']
row = params['row']
# xchar = None
# if 'xchar' is not None:
# xchar = params['xchar']
#path = ''.join([self.path,os.sep,label])
# path = ''.join([self.path,os.sep,self.name])
#if os.path.exists(path) == False:
# os.mkdir(path) ;
# path = ''.join([path,os.sep,self.name])
f = open(self.path,'a')
if isinstance(row,object):
row = json.dumps(row)
#row = self.format(row,xchar);
f.write(row+"\n")
f.write(self.format(info))
f.close()

@ -4,7 +4,12 @@ Steve L. Nyemba, The Phi Technology LLC
This file is a wrapper around mongodb for reading/writing content against a mongodb server and executing views (mapreduce)
"""
from pymongo import MongoClient
from pymongo import MongoClient
from bson.objectid import ObjectId
from bson.binary import Binary
import json
from datetime import datetime
import gridfs
# from transport import Reader,Writer
import sys
if sys.version_info[0] > 2 :
@ -19,11 +24,11 @@ class Mongo :
def __init__(self,**args):
"""
:dbname database name/identifier
:host host and port of the database
:host host and port of the database by default localhost:27017
:username username for authentication
:password password for current user
"""
host = args['host']
host = args['host'] if 'host' in args else 'localhost:27017'
if 'user' in args and 'password' in args:
self.client = MongoClient(host,
@ -31,7 +36,7 @@ class Mongo :
password=args['password'] ,
authMechanism='SCRAM-SHA-256')
else:
self.client = MongoClient()
self.client = MongoClient(host)
self.uid = args['doc'] #-- document identifier
self.dbname = args['dbname']
@ -62,17 +67,67 @@ class MongoWriter(Mongo,Writer):
"""
def __init__(self,**args):
Mongo.__init__(self,**args)
def write(self,**args):
def upload(self,**args) :
"""
This function will upload a file to the current database (using GridFS)
:param data binary stream/text to be stored
:param filename filename to be used
:param encoding content_encoding (default utf-8)
"""
if 'encoding' not in args :
args['encoding'] = 'utf-8'
gfs = GridFS(self.db)
gfs.put(**args)
def archive(self):
"""
This function will archive documents to the
"""
collection = self.db[self.uid]
rows = list(collection.find())
for row in rows :
if type(row['_id']) == ObjectId :
row['_id'] = str(row['_id'])
stream = Binary(json.dumps(collection).encode())
collection.delete_many({})
now = "-".join([str(datetime.now().year()),str(datetime.now().month), str(datetime.now().day)])
name = ".".join([self.uid,'archive',now])+".json"
description = " ".join([self.uid,'archive',str(len(rows))])
self.upload(filename=name,data=stream,description=description,content_type='application/json')
# gfs = GridFS(self.db)
# gfs.put(filename=name,description=description,data=stream,encoding='utf-8')
# self.write({{"filename":name,"file":stream,"description":descriptions}})
pass
def write(self,info):
"""
This function will write to a given collection i.e add a record to a collection (no updates)
@param info new record in the collection to be added
"""
# document = self.db[self.uid].find()
collection = self.db[self.uid]
if type(args['row']) == list :
self.db[self.uid].insert_many(args['row'])
# if type(info) == list :
# self.db[self.uid].insert_many(info)
# else:
if (type(info) == list) :
self.db[self.uid].insert_many(info)
else:
self.db[self.uid].insert_one(args['row'])
self.db[self.uid].insert_one(info)
def set(self,document):
"""
if no identifier is provided the function will delete the entire collection and set the new document.
Please use this function with great care (archive the content first before using it... for safety)
"""
collection = self.db[self.uid]
if collection.count_document() > 0 :
collection.delete({_id:self.uid})
collecton.update_one({"_id":self.uid},document,True)
if collection.count_document() > 0 and '_id' in document:
id = document['_id']
del document['_id']
collection.find_one_and_replace({'_id':id},document)
else:
collection.delete_many({})
self.write(info)
# collecton.update_one({"_id":self.uid},document,True)

Loading…
Cancel
Save