You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
data-transport/transport/__init__.py

210 lines
5.1 KiB
Python

"""
Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC
This module is designed to serve as a wrapper to a set of supported data stores :
- couchdb
- mongodb
- Files (character delimited)
- Queues (RabbmitMq)
- Session (Flask)
- s3
The supported operations are read/write and providing meta data to the calling code
Requirements :
pymongo
boto
couldant
The configuration for the data-store is as follows :
couchdb:
{
args:{
url:<url>,
username:<username>,
password:<password>,
dbname:<database>,
# m[xchar].append(0)
# #
# # The delimiter with the smallest variance, provided the mean is greater than 1
# # This would be troublesome if there many broken records sampled
# #
# m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1}
# index = m.values().index( min(m.values()))
# xchar = m.keys()[index]
# return xchar
# def col_count(self,sample):
# """
# This function retirms the number of columns of a given sample
# @pre self.xchar is not None
# """
# m = {}
# i = 0
# for row in sample:
# row = self.format(row)
# id = str(len(row))
# #id = str(len(row.split(self.xchar)))
# if id not in m:
# m[id] = 0
# m[id] = m[id] + 1
# index = m.values().index( max(m.values()) )
# ncols = int(m.keys()[index])
# return ncols;
# def format (self,row):
# """
# This function will clean records of a given row by removing non-ascii characters
# @pre self.xchar is not None
# """
# if isinstance(row,list) == False:
# #
# # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary)
# cols = self.split(row)
# #cols = row.split(self.xchar)
# else:
# cols = row ;
# return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols]
# def split (self,row):
# """
# This function performs a split of a record and tries to attempt to preserve the integrity of the data within i.e accounting for the double quotes.
# @pre : self.xchar is not None
# """
# pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"])
# return re.findall(pattern,row.replace('\n',''))
# class Writer:
# def format(self,row,xchar):
# if xchar is not None and isinstance(row,list):
# return xchar.join(row)+'\n'
# elif xchar is None and isinstance(row,dict):
# row = json.dumps(row)
# return row
# """
# It is important to be able to archive data so as to insure that growth is controlled
# Nothing in nature grows indefinitely neither should data being handled.
# """
# def archive(self):
# pass
# def flush(self):
# pass
# class factory :
# @staticmethod
# def instance(**args):
# source = args['type']
# params = args['args']
# anObject = None
# if source in ['HttpRequestReader','HttpSessionWriter']:
# #
# # @TODO: Make sure objects are serializable, be smart about them !!
# #
# aClassName = ''.join([source,'(**params)'])
# else:
# stream = json.dumps(params)
# aClassName = ''.join([source,'(**',stream,')'])
# try:
# anObject = eval( aClassName)
# #setattr(anObject,'name',source)
# except Exception,e:
# print ['Error ',e]
# return anObject