| 
						
						
							
								
							
						
						
					 | 
					 | 
					@ -7,6 +7,7 @@ import os
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					from multiprocessing import Process, Lock
 | 
					 | 
					 | 
					 | 
					from multiprocessing import Process, Lock
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					import pandas as pd
 | 
					 | 
					 | 
					 | 
					import pandas as pd
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					from google.oauth2 import service_account
 | 
					 | 
					 | 
					 | 
					from google.oauth2 import service_account
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					from google.cloud import bigquery as bq
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					import data.maker
 | 
					 | 
					 | 
					 | 
					import data.maker
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					from data.params import SYS_ARGS 
 | 
					 | 
					 | 
					 | 
					from data.params import SYS_ARGS 
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -115,10 +116,44 @@ class Components :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							data.maker.train(**_args)
 | 
					 | 
					 | 
					 | 
							data.maker.train(**_args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							
 | 
					 | 
					 | 
					 | 
							
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							if 'autopilot' in ( list(args.keys())) :
 | 
					 | 
					 | 
					 | 
							if 'autopilot' in ( list(args.keys())) :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								print (['drone mode enabled ....'])
 | 
					 | 
					 | 
					 | 
								print (['autopilot mode enabled ....'])
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								self.generate(args)
 | 
					 | 
					 | 
					 | 
								self.generate(args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							pass
 | 
					 | 
					 | 
					 | 
							pass
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
						def shuffle(self,args):
							"""
							Rebuild one column of a data-frame from a randomly re-paired frequency table.

							The distinct values of the column and their counts are shuffled independently,
							so every value ends up with a count drawn from the original set of counts.
							The column is then regenerated from those (value,count) pairs and its row
							order is shuffled.

							:args	dictionary with the following keys
								reader	optional callable returning the data-frame (used when present)
								data	data-frame used when 'reader' is absent
								columns	list of column names, only the first one is processed
							:return	the data-frame that was read, with the column overwritten in place
							"""
							df 		= args['reader']() if 'reader' in args else args['data']
							col  	= args['columns'][0]
							#
							# value_counts skips null entries by default, the rows it does not account for
							# keep the zero value np.zeros produces for the column's dtype (as before)
							distrib = df[col].value_counts()
							values	= np.array(distrib.index)
							counts 	= np.array(distrib.values)
							np.random.shuffle(values)
							np.random.shuffle(counts)
							#
							# np.repeat lays out every value exactly counts[i] times. The previous slot search
							# relied on 0 meaning "empty" (clobbering genuine zeros) and on a strict size
							# comparison that dropped the last value whenever the counts filled every row
							filled 	 = np.repeat(values,counts)
							iovalues = np.zeros(df.shape[0],dtype=df[col].dtype)
							iovalues[:filled.size] = filled
							np.random.shuffle(iovalues)
							df[col] = iovalues
							return df
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
						def post(self,args):
							"""
							Placeholder hook: accepts the arguments and performs no work.

							:args	ignored
							:return	None
							"""
							return None
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
								
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					
 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
						# @staticmethod
 | 
					 | 
					 | 
					 | 
						# @staticmethod
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
						def generate(self,args):
 | 
					 | 
					 | 
					 | 
						def generate(self,args):
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -181,12 +216,12 @@ class Components :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							#	let us fix the data types here every _id field will be an np.int64...
 | 
					 | 
					 | 
					 | 
							#	let us fix the data types here every _id field will be an np.int64...
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							#
 | 
					 | 
					 | 
					 | 
							#
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							
 | 
					 | 
					 | 
					 | 
							
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							for name in df.columns.tolist():
 | 
					 | 
					 | 
					 | 
							# for name in df.columns.tolist():
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								
 | 
					 | 
					 | 
					 | 
								
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								if name.endswith('_id') :
 | 
					 | 
					 | 
					 | 
							# 	if name.endswith('_id') :
 | 
				
			
			
				
				
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									if df[name].isnull().sum() > 0 :
 | 
					 | 
					 | 
					 | 
							# 		if df[name].isnull().sum() > 0 and name not in ['unique_device_id']:
 | 
				
			
			
				
				
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
										df[name].fillna(np.nan_to_num(np.nan),inplace=True)					
 | 
					 | 
					 | 
					 | 
							# 			df[name].fillna(np.nan_to_num(np.nan),inplace=True)					
 | 
				
			
			
				
				
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
										df[name] = df[name].astype(int)
 | 
					 | 
					 | 
					 | 
							# 			df[name] = df[name].astype(int)
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
	
		
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							
 | 
					 | 
					 | 
					 | 
							
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							
 | 
					 | 
					 | 
					 | 
							
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							_dc = pd.DataFrame()
 | 
					 | 
					 | 
					 | 
							_dc = pd.DataFrame()
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -232,6 +267,11 @@ class Components :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								
 | 
					 | 
					 | 
					 | 
								
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								_id = 'path'
 | 
					 | 
					 | 
					 | 
								_id = 'path'
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							else:
 | 
					 | 
					 | 
					 | 
							else:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
								client      = bq.Client.from_service_account_json(args["private_key"])
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
								full_schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
								full_schema = [{'name':item.name,'type':item.field_type,'description':item.description} for item in full_schema]
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
								io_schema = [{'name':item['name'],'type':item['type'],'description':item['description']} for item in full_schema if item['name'] in args['columns']]
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
 | 
					 | 
					 | 
					 | 
								credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								_pname = os.sep.join([folder,table+'.csv'])
 | 
					 | 
					 | 
					 | 
								_pname = os.sep.join([folder,table+'.csv'])
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								_fname = table.replace('_io','_full_io')
 | 
					 | 
					 | 
					 | 
								_fname = table.replace('_io','_full_io')
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
					 | 
					@ -243,11 +283,11 @@ class Components :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								else:
 | 
					 | 
					 | 
					 | 
								else:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									Components.lock.acquire()
 | 
					 | 
					 | 
					 | 
									Components.lock.acquire()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									
 | 
					 | 
					 | 
					 | 
									
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)	
 | 
					 | 
					 | 
					 | 
									data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000,table_schema=io_schema)	
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									
 | 
					 | 
					 | 
					 | 
									
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'	
 | 
					 | 
					 | 
					 | 
									INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'	
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									
 | 
					 | 
					 | 
					 | 
									
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
 | 
					 | 
					 | 
					 | 
									_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000,table_schema=full_schema)
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
									Components.lock.release()
 | 
					 | 
					 | 
					 | 
									Components.lock.release()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								_id = 'dataset'
 | 
					 | 
					 | 
					 | 
								_id = 'dataset'
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
 | 
					 | 
					 | 
					 | 
							info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
							
								
							
						
						
					 | 
					 | 
					@ -354,7 +394,12 @@ if __name__ == '__main__' :
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							else:
 | 
					 | 
					 | 
					 | 
							else:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
								generator.generate(args)
 | 
					 | 
					 | 
					 | 
								generator.generate(args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							# Components.generate(args)
 | 
					 | 
					 | 
					 | 
							# Components.generate(args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
						
 | 
					 | 
					 | 
					 | 
						elif 'shuffle' in SYS_ARGS:
 | 
				
			
			
				
				
			
		
	
		
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
							args['data'] = DATA[0]
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
							_df = (Components()).shuffle(args)
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
							print (DATA[0][args['columns']])
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
							print ()
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
					 | 
							print (_df[args['columns']])
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
						else:
 | 
					 | 
					 | 
					 | 
						else:
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							
 | 
					 | 
					 | 
					 | 
							
 | 
				
			
			
		
	
		
		
			
				
					
					 | 
					 | 
					 | 
							# DATA  = np.array_split(DATA,PART_SIZE)
 | 
					 | 
					 | 
					 | 
							# DATA  = np.array_split(DATA,PART_SIZE)
 | 
				
			
			
		
	
	
		
		
			
				
					| 
						
							
								
							
						
						
						
					 | 
					 | 
					
 
 |