data-transport/transport/__init__.py

"""
Data Transport - 1.0
Steve L. Nyemba, The Phi Technology LLC

This module is designed to serve as a wrapper to a set of supported data stores :
    - couchdb
    - mongodb
    - Files (character delimited)
    - Queues (RabbitMQ)
    - Session (Flask)
    - s3
The supported operations are read/write and providing meta data to the calling code
Requirements :
	pymongo
	boto
	cloudant
The configuration for the data-store is as follows :
	couchdb:
		{
			args:{
				url:<url>,
				username:<username>,
				password:<password>,
				dbname:<database>,
# 					m[xchar].append(0)
				
				
# 		#
# 		# The delimiter with the smallest variance, provided the mean is greater than 1
# 		# This would be troublesome if there are many broken records sampled
# 		#
# 		m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1}
# 		index = m.values().index( min(m.values()))
# 		xchar = m.keys()[index]
		
# 		return xchar
# 	def col_count(self,sample):
# 		"""
# 		This function returns the number of columns of a given sample
# 		@pre self.xchar is not None
# 		"""
		
# 		m = {}
# 		i = 0
		
# 		for row in sample:
# 			row = self.format(row)
# 			id = str(len(row))
# 			#id = str(len(row.split(self.xchar))) 
			
# 			if id not in m:
# 				m[id] = 0
# 			m[id] = m[id] + 1
		
# 		index = m.values().index( max(m.values()) )
# 		ncols = int(m.keys()[index])
		
		
# 		return ncols;
# 	def format (self,row):
# 		"""
# 			This function will clean records of a given row by removing non-ascii characters
# 			@pre self.xchar is not None
# 		"""
		
# 		if isinstance(row,list) == False:
# 			#
# 			# We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary)
# 			cols = self.split(row)
# 			#cols = row.split(self.xchar)
# 		else:
# 			cols = row ;
# 		return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols]
		
# 	def split (self,row):
# 		"""
# 			This function performs a split of a record and attempts to preserve the integrity of the data within, i.e. accounting for the double quotes.
# 			@pre : self.xchar is not None
# 		""" 

# 		pattern = "".join(["(?:^|",self.xchar,")(\"(?:[^\"]+|\"\")*\"|[^",self.xchar,"]*)"])
# 		return re.findall(pattern,row.replace('\n',''))


# class Writer:
	
# 	def format(self,row,xchar):
# 		if xchar is not None and isinstance(row,list):
# 			return xchar.join(row)+'\n'
# 		elif xchar is None and isinstance(row,dict):
# 			row = json.dumps(row)
# 		return row
# 	"""
# 		It is important to be able to archive data so as to ensure that growth is controlled
# 		Nothing in nature grows indefinitely; neither should the data being handled.
# 	"""
# 	def archive(self):
# 		pass
# 	def flush(self):
# 		pass

# class factory :
# 	@staticmethod
# 	def instance(**args):
		
# 		source = args['type']		
# 		params = args['args']
# 		anObject = None
		
# 		if source in ['HttpRequestReader','HttpSessionWriter']:
# 			#
# 			# @TODO: Make sure objects are serializable, be smart about them !!
# 			#
# 			aClassName = ''.join([source,'(**params)'])


# 		else:
			
# 			stream = json.dumps(params)
# 			aClassName = ''.join([source,'(**',stream,')'])
# 		try:
# 			anObject = eval( aClassName)
# 			#setattr(anObject,'name',source)
# 		except Exception,e:
# 			print ['Error ',e]
# 		return anObject
data transport framework for rabbitmq, mongodb, couchdb, ... 5 years ago			`"""`
			`Data Transport - 1.0`
			`Steve L. Nyemba, The Phi Technology LLC`

			`This module is designed to serve as a wrapper to a set of supported data stores :`
			`- couchdb`
			`- mongodb`
			`- Files (character delimited)`
			`- Queues (RabbitMQ)`
			`- Session (Flask)`
			`- s3`
			`The supported operations are read/write and providing meta data to the calling code`
			`Requirements :`
			`pymongo`
			`boto`
			`cloudant`
			`The configuration for the data-store is as follows :`
			`couchdb:`
			`{`
			`args:{`
			`url:<url>,`
			`username:<username>,`
			`password:<password>,`
			`dbname:<database>,`
data transport framework for rabbitmq, mongodb, couchdb, ... 5 years ago			`# m[xchar].append(0)`



			`# #`
			`# # The delimiter with the smallest variance, provided the mean is greater than 1`
			`# # This would be troublesome if there are many broken records sampled`
			`# #`
			`# m = {id: np.var(m[id]) for id in m.keys() if m[id] != [] and int(np.mean(m[id]))>1}`
			`# index = m.values().index( min(m.values()))`
			`# xchar = m.keys()[index]`

			`# return xchar`
			`# def col_count(self,sample):`
			`# """`
			`# This function returns the number of columns of a given sample`
			`# @pre self.xchar is not None`
			`# """`

			`# m = {}`
			`# i = 0`

			`# for row in sample:`
			`# row = self.format(row)`
			`# id = str(len(row))`
			`# #id = str(len(row.split(self.xchar)))`

			`# if id not in m:`
			`# m[id] = 0`
			`# m[id] = m[id] + 1`

			`# index = m.values().index( max(m.values()) )`
			`# ncols = int(m.keys()[index])`


			`# return ncols;`
			`# def format (self,row):`
			`# """`
			`# This function will clean records of a given row by removing non-ascii characters`
			`# @pre self.xchar is not None`
			`# """`

			`# if isinstance(row,list) == False:`
			`# #`
			`# # We've observed sometimes fields contain delimiter as a legitimate character, we need to be able to account for this and not tamper with the field values (unless necessary)`
			`# cols = self.split(row)`
			`# #cols = row.split(self.xchar)`
			`# else:`
			`# cols = row ;`
			`# return [ re.sub('[^\x00-\x7F,\n,\r,\v,\b,]',' ',col.strip()).strip().replace('"','') for col in cols]`

			`# def split (self,row):`
			`# """`
			`# This function performs a split of a record and attempts to preserve the integrity of the data within, i.e. accounting for the double quotes.`
			`# @pre : self.xchar is not None`
			`# """`

			`# pattern = "".join(["(?:^\|",self.xchar,")(\"(?:[^\"]+\|\"\")*\"\|[^",self.xchar,"]*)"])`
			`# return re.findall(pattern,row.replace('\n',''))`


			`# class Writer:`

			`# def format(self,row,xchar):`
			`# if xchar is not None and isinstance(row,list):`
			`# return xchar.join(row)+'\n'`
			`# elif xchar is None and isinstance(row,dict):`
			`# row = json.dumps(row)`
			`# return row`
			`# """`
			`# It is important to be able to archive data so as to ensure that growth is controlled`
			`# Nothing in nature grows indefinitely; neither should the data being handled.`
			`# """`
			`# def archive(self):`
			`# pass`
			`# def flush(self):`
			`# pass`

			`# class factory :`
			`# @staticmethod`
			`# def instance(**args):`

			`# source = args['type']`
			`# params = args['args']`
			`# anObject = None`

			`# if source in ['HttpRequestReader','HttpSessionWriter']:`
			`# #`
			`# # @TODO: Make sure objects are serializable, be smart about them !!`
			`# #`
			`# aClassName = ''.join([source,'(**params)'])`


			`# else:`

			`# stream = json.dumps(params)`
			`# aClassName = ''.join([source,'(**',stream,')'])`
			`# try:`
			`# anObject = eval( aClassName)`
			`# #setattr(anObject,'name',source)`
			`# except Exception,e:`
			`# print ['Error ',e]`
			`# return anObject`