fix: handling outliers and missing values

Branch: dev
Steve L. Nyemba, 5 years ago
parent f920ba0eda · commit 71097103da

@ -197,11 +197,20 @@ class Binary :
"""
This function will return the values of a column that are available for processing ...
"""
values = column.dropna().value_counts().index
values = column.dropna().value_counts().index.values
if size > 0 and column.size > size:
values = values[:size]
values.sort_values()
values.sort()
return values
def get_missing(self,column,size=-1):
values = column.dropna().value_counts().index.values
if size > 0 and column.size > size :
values = values[size:]
else:
values = np.array([])
values.sort()
return values.tolist()
def _get_column_values(self,column,size=-1):
values = column.dropna().unique()
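As a quick illustration of how the get_column / get_missing pair splits a column: value_counts() orders the distinct values by frequency, so the first size entries become the encoding space and the remainder becomes the pool of rare/outlier values. A minimal sketch (the example column and size are made up):

    import numpy as np
    import pandas as pd

    # Hypothetical column: 'a' and 'b' dominate, 'x' and 'y' are rare outliers
    column = pd.Series(['a', 'a', 'a', 'b', 'b', 'x', 'y'])
    size = 2   # illustrative matrix_size cap

    ranked = column.dropna().value_counts().index.values   # ordered by frequency
    frequent = np.sort(ranked[:size])          # what get_column returns -> ['a' 'b']
    tail = np.sort(ranked[size:]).tolist()     # what get_missing returns -> ['x', 'y']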

@ -536,9 +536,10 @@ class Predict(GNet):
self.values = args['values']
self.ROW_COUNT = args['row_count']
self.oROW_COUNT = self.ROW_COUNT
self.MISSING_VALUES = np.nan_to_num(np.nan)
if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = np.nan_to_num(np.nan)
# if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
# self.MISSING_VALUES = args['no_value']
self.MISSING_VALUES = args['missing']
# self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
@ -650,15 +651,18 @@ class Predict(GNet):
# df.columns = self.values
if len(found) or df.columns.size <= len(self.values):
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
# print ([' **** ',ii.sum()])
if ii.shape[0] > 0 :
missing = []
if ii.sum() > 0 :
#
# If the generator had a reductive effect, we should be able to draw random values from either:
# - the space of outliers
# - existing values from smaller spaces that were under-represented during training
#
#@TODO Have this be a configurable variable
missing = np.repeat(self.MISSING_VALUES, np.where(ii==1)[0].size)
else:
missing = []
N = ii.sum()
missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values
missing = np.random.choice(missing_values,N)
# missing = []
#
# @TODO:
# Log the findings here in terms of ratio, missing, candidate count
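A self-contained sketch of the fall-back above: rows the generator left entirely at zero are counted, and replacement values are drawn at random from the outlier pool (or from the known values when no such pool exists). The data below is made up:

    import numpy as np
    import pandas as pd

    # Hypothetical generator output over 4 candidate values; two rows are all zeros
    df = pd.DataFrame([[0, 1, 0, 0],
                       [0, 0, 0, 0],
                       [1, 0, 0, 0],
                       [0, 0, 0, 0],
                       [0, 0, 1, 0]])
    values = ['a', 'b', 'c', 'd']      # encoding space (Binary.get_column)
    missing_pool = ['x', 'y']          # rare/outlier values (Binary.get_missing)

    ii = df.apply(lambda row: np.sum(row) == 0, axis=1)   # all-zero rows
    N = ii.sum()                                          # -> 2
    pool = missing_pool if missing_pool else values       # fall back to known values
    missing = np.random.choice(pool, N)                   # e.g. ['y', 'x']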
@ -669,6 +673,8 @@ class Predict(GNet):
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
df.columns = columns
df = df[columns[0]].append(pd.Series(missing))
if self.logger :
info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION}
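The apply(...) line above turns each surviving row back into an actual value by picking one of its non-zero positions at random; a small sketch with made-up data:

    import numpy as np
    import pandas as pd

    values = ['a', 'b', 'c', 'd']            # candidate values for the column
    df = pd.DataFrame([[0, 1, 0, 0],
                       [1, 0, 1, 0]])        # a row may have several active bits
    i = np.arange(df.shape[0])               # indices of the non-empty rows

    decoded = df.iloc[i].apply(
        lambda row: values[np.random.choice(np.where(row != 0)[0], 1)[0]],
        axis=1)
    # row 0 -> 'b'; row 1 -> 'a' or 'c', chosen at random among the active bits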
@ -680,39 +686,8 @@ class Predict(GNet):
tf.compat.v1.reset_default_graph()
df = pd.DataFrame(df)
df.columns = columns
np.random.shuffle(df[columns[0]].values)
return df.to_dict(orient='list')
# return df.to_dict(orient='list')
# count = str(len(os.listdir(self.out_dir)))
# _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
# df.to_csv(_name,index=False)
# output.extend(np.round(f))
# for m in range(2):
# for n in range(2, self.NUM_LABELS):
# idx1 = (demo[:, m] == 1)
# idx2 = (demo[:, n] == 1)
# idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
# num = np.sum(idx)
# print ("___________________list__")
# print (idx1)
# print (idx2)
# print (idx)
# print (num)
# print ("_____________________")
# nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
# label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
# label_input[:, n] = 1
# label_input[:, m] = 1
# output = []
# for i in range(nbatch):
# f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
# output.extend(np.round(f))
# output = np.array(output)[:num]
# print ([m,n,output])
# np.save(self.out_dir + str(m) + str(n), output)
if __name__ == '__main__' :

@ -21,29 +21,8 @@ class ContinuousToDiscrete :
"""
This function will convert a continuous stream of values into a bit-stream (one-hot) matrix of bins
"""
# BOUNDS = np.repeat(np.divide(X.max(),n),n).cumsum().tolist()
# print ( X.values.astype(np.float32))
# print ("___________________________")
values = np.array(X).astype(np.float32)
BOUNDS = ContinuousToDiscrete.bounds(values,n)
# _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS]
# _matrix = []
# m = []
# for value in X :
# x_ = np.zeros(n)
# for row in BOUNDS :
# if value>= row.left and value <= row.right :
# index = BOUNDS.index(row)
# x_[index] = 1
# break
# _matrix += x_.tolist()
# #
# # for items in BOUNDS :
# # index = BOUNDS.index(items)
# return np.array(_matrix).reshape(len(X),n)
matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)
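For context, the binning that binary() performs can be pictured as follows. The bounds() helper is not part of this diff, so pd.cut stands in for it here; this is only a sketch of the intent, not the library's implementation:

    import numpy as np
    import pandas as pd

    def to_one_hot_bins(X, n=4):
        # Bin a continuous series into n equal-width bins and one-hot encode it
        values = np.array(X).astype(np.float32)
        bins = pd.cut(values, n, labels=False)        # bin index per value
        matrix = np.zeros((len(values), n))
        matrix[np.arange(len(values)), bins] = 1
        return matrix

    # to_one_hot_bins([1.0, 2.5, 9.0], n=2) -> [[1,0], [1,0], [0,1]]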
@ -123,25 +102,9 @@ def train (**args) :
# @TODO : Consider performing this task on several threads/GPUs simultaneously
#
for col in column :
# args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
# if 'float' not in df[col].dtypes.name :
# args['real'] = pd.get_dummies(df[col].fillna('')).astype(np.float32).values
# if col in CONTINUOUS:
# BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
# args['real'] = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32)
# # args['real'] = args['real'].reshape(df.shape[0],BIN_SIZE)
# else:
# df.to_csv('tmp-'+args['logs'].replace('/','_')+'-'+col+'.csv',index=False)
# print (df[col].dtypes)
# print (df[col].dropna/(axis=1).unique())
# args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
msize = args['matrix_size'] if 'matrix_size' in args else -1
args['real'] = (Binary()).apply(df[col],msize)
context = args['context']
if 'store' in args :
args['store']['args']['doc'] = context
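The training loop now hands each column to Binary().apply with an optional matrix_size cap. Binary.apply itself is not shown in this diff, so the sketch below only illustrates the assumed contract: a float32 0/1 matrix restricted to the most frequent values, with pd.get_dummies used as a stand-in:

    import numpy as np
    import pandas as pd

    def encode_column(col, msize=-1):
        # Assumed behaviour of Binary().apply: one-hot encode the column,
        # keeping at most msize of the most frequent values
        values = col.dropna().value_counts().index
        if msize > 0:
            values = values[:msize]
        dummies = pd.get_dummies(col).reindex(columns=sorted(values), fill_value=0)
        return dummies.astype(np.float32).values

    real = encode_column(pd.Series(['a', 'b', 'a', 'c']), msize=2)   # shape (4, 2)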
@ -191,38 +154,28 @@ def generate(**args):
# If the identifier is not present, we should find a way to determine or make one
#
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
# NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
bhandler = Binary()
_df = df.copy()
for col in column :
args['context'] = col
args['column'] = col
# if 'float' in df[col].dtypes.name or col in CONTINUOUS :
# #
# # We should create the bins for the values we are observing here
# BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
# values = ContinuousToDiscrete.continuous(df[col].values,BIN_SIZE)
# # values = np.unique(values).tolist()
# else:
# if col in CONTINUOUS :
# values = ContinuousToDiscrete.binary(df[col],BIN_SIZE).astype(np.float32).T
# else:
# values = df[col].dropna().unique().tolist()
msize = args['matrix_size'] if 'matrix_size' in args else -1
values = bhandler.get_column(df[col],msize)
MISSING= bhandler.get_missing(df[col],msize)
args['values'] = values
args['row_count'] = df.shape[0]
if col in NO_VALUE :
args['no_value'] = NO_VALUE[col]
else:
args['no_value'] = NO_VALUE
# if col in NO_VALUE :
# args['no_value'] = NO_VALUE[col]
# else:
# args['no_value'] = NO_VALUE
# novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col]
# MISSING += [NO_VALUE[col]]
args['missing'] = MISSING
#
# we can determine the cardinalities here so we know what to allow or disallow
handler = gan.Predict (**args)
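Putting the per-column setup together, the arguments handed to gan.Predict now look roughly like this (column name and values are illustrative):

    # Illustrative shape of the per-column arguments fed to gan.Predict
    args = {
        'context': 'gender',               # current column
        'column': 'gender',
        'values': ['F', 'M'],              # frequent values (bhandler.get_column)
        'missing': ['U'],                  # rare/outlier values (bhandler.get_missing)
        'row_count': 1000,                 # df.shape[0]
        'matrix_size': 128,                # optional cap on the encoding width
    }
    # handler = gan.Predict(**args); r = handler.apply()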
@ -230,22 +183,20 @@ def generate(**args):
r = handler.apply()
if col in CONTINUOUS :
r[col] = np.array(r[col])
MISSING= np.nan if args['no_value'] in ['na','','NA'] else args['no_value']
_approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins
r[col] = _approx
if np.isnan(MISSING):
i = np.isnan(r[col])
i = np.where (i == False)[0]
else:
i = np.where( r[col] != None)[0]
_approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins
r[col][i] = _approx
_df[col] = r[col]
#
# Let's cast the type to the original type (it makes the data more usable)
#
# print (values)
# print ([col,df[col].dtype,_df[col].tolist()])
otype = df[col].dtype
_df[col] = _df[col].astype(otype)
#
# @TODO: log basic stats about the synthetic attribute
#
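The continuous columns are now re-approximated only where the synthetic value is not NaN, so injected missing values survive the post-processing. A minimal sketch, with a stand-in for ContinuousToDiscrete.continuous:

    import numpy as np

    col = np.array([1.2, np.nan, 3.4, np.nan, 5.6])
    i = np.where(~np.isnan(col))[0]          # indices of the non-missing entries

    def approximate(values, bins=4):
        # Stand-in for ContinuousToDiscrete.continuous: snap values to bin mid-points
        edges = np.linspace(values.min(), values.max(), bins + 1)
        idx = np.clip(np.digitize(values, edges) - 1, 0, bins - 1)
        return (edges[idx] + edges[idx + 1]) / 2

    col[i] = approximate(col[i])             # NaN positions are left untouched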

@ -9,8 +9,10 @@ if len(sys.argv) > 1:
if sys.argv[i].startswith('--'):
key = sys.argv[i][2:] #.replace('-','')
SYS_ARGS[key] = 1
if i + 1 < N:
if i + 1 < N and not sys.argv[i + 1].startswith('--'):
value = sys.argv[i + 1] = sys.argv[i+1].strip()
else:
value = None
if key and value:
SYS_ARGS[key] = value
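With the extra startswith('--') guard, a bare flag no longer swallows the next option as its value. A condensed restatement of the loop, run on a hypothetical command line:

    # Hypothetical invocation: python finalize.py --init --config ./pipeline.json
    argv = ['finalize.py', '--init', '--config', './pipeline.json']

    SYS_ARGS = {}
    N = len(argv)
    for i in range(1, N):
        if argv[i].startswith('--'):
            key = argv[i][2:]
            SYS_ARGS[key] = 1
            if i + 1 < N and not argv[i + 1].startswith('--'):
                SYS_ARGS[key] = argv[i + 1].strip()

    print(SYS_ARGS)   # {'init': 1, 'config': './pipeline.json'}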

@ -6,10 +6,13 @@ This file will perform basic tasks to finalize the GAN process by performing the
"""
import pandas as pd
import numpy as np
from multiprocessing import Process, Lock
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import transport
from data.params import SYS_ARGS
import json
class Analytics :
"""
This class will compile basic analytics about a given dataset i.e compare original/synthetic
@ -33,15 +36,23 @@ class Analytics :
"""
This function will measure the distance between the original and the synthetic dataset
"""
df = args['data']
names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
pass
class Utils :
@staticmethod
def log(**args):
logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})
logger.write(args)
logger.close()
class get :
@staticmethod
def config(**args) :
contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
pipeline = args['pipeline']
return [ item for item in pipeline if item['context'] in contexts]
def pipeline(table,path) :
# contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
config = json.loads((open(path)).read())
pipeline = config['pipeline']
# return [ item for item in pipeline if item['context'] in contexts]
pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
Utils.log(module=table,action='init',input={"pipeline":pipeline})
return pipeline
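The new pipeline() helper reads the configuration file and keeps only the entries whose 'from' matches the requested table. A sketch with a hypothetical configuration (the context names are made up):

    # Hypothetical content of the configuration file passed via --config
    config = {
        "pipeline": [
            {"context": "measurement_gan", "from": "measurement"},
            {"context": "observation_gan", "from": "observation"},
        ]
    }

    table = 'measurement'
    pipeline = [item for item in config['pipeline']
                if 'from' in item and item['from'].strip() == table]
    # -> keeps only the measurement entry, which is then logged and returned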
@staticmethod
def sql(**args) :
"""
@ -54,7 +65,8 @@ class Utils :
SQL = ["SELECT * FROM :from "]
SQL_FILTER = []
NO_FILTERS_FOUND = True
pipeline = Utils.get.config(**args)
# pipeline = Utils.get.config(**args)
pipeline = args['pipeline']
REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
for item in pipeline :
@ -73,7 +85,7 @@ class Utils :
#
# let's pull the field schemas out of the table definition
#
Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
return " ".join(SQL).replace(":from",src)
@ -91,26 +103,36 @@ def mk(**args) :
return client.create_dataset(dataset)
return found[0]
def move (**args):
def move (args):
"""
This function will move a table from the synthetic dataset into a designated location
This is the simplest case for finalizing a synthetic data set
:private_key
"""
private_key = args['private_key']
client = bq.Client.from_service_account_json(private_key)
config = Utils.get.config(**args)
pipeline = Utils.get.pipeline(args['from'],args['config'])
_args = json.loads((open(args['config'])).read())
_args['pipeline'] = pipeline
# del _args['pipeline']
args = dict(args,**_args)
# del args['pipeline']
# private_key = args['private_key']
client = bq.Client.from_service_account_json(args['private_key'])
dataset = args['dataset']
if 'contexts' in args :
SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in config]
if pipeline :
SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
SQL += [Utils.get.sql(**args)]
SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
else:
#
# moving a table to a designated location
tablename = args['from']
SQL = "SELECT * FROM :dataset.:table".replace(":dataset",dataset).replace(":table",tablename)
if 'sql' not in args :
SQL = "SELECT * FROM :dataset.:table"
else:
SQL = args['sql']
SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
Utils.log(module=args['from'],action='sql',input={'sql':SQL})
#
# At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
#
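When a pipeline is found, move() stitches the per-context io tables together with the residual rows from the original table. An illustrative assembly (context names are made up, and the trailing SELECT stands in for Utils.get.sql):

    pipeline = [{'context': 'measurement_gan'}, {'context': 'observation_gan'}]

    SQL = [''.join(["SELECT * FROM io.", item['context'], '_full_io']) for item in pipeline]
    SQL += ["SELECT * FROM :from WHERE ..."]      # placeholder for Utils.get.sql(**args)
    SQL = '\n UNION ALL \n'.join(SQL).replace(':dataset', 'io')
    # SELECT * FROM io.measurement_gan_full_io
    #  UNION ALL
    # SELECT * FROM io.observation_gan_full_io
    #  UNION ALL
    # SELECT * FROM :from WHERE ...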
@ -132,7 +154,7 @@ def move (**args):
SQL = SQL.replace("*"," , ".join(fields))
# print (SQL)
out = client.query(SQL,location='US',job_config=config)
print ()
Utils.log(module=args['from'],action='move',input={'job':out.job_id})
return (out.job_id)
@ -158,23 +180,59 @@ if __name__ == '__main__' :
Usage :
finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
"""
if 'move' in SYS_ARGS :
# table = SYS_ARGS['from']
# args = dict(config,**{"private_key":"../curation-prod.json"})
args = dict(args,**SYS_ARGS)
contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
log = []
if contexts :
args['contexts'] = contexts
log = move(**args)
if 'init' in SYS_ARGS :
dep = config['dep'] if 'dep' in config else {}
info = []
if 'queries' in dep :
info += dep['queries']
print ('________')
if 'tables' in dep :
info += dep['tables']
args = {}
jobs = []
for item in info :
args = {}
if type(item) == str :
args['from'] = item
name = item
else:
args = item
name = item['from']
args['config'] = SYS_ARGS['config']
# args['pipeline'] = []
job = Process(target=move,args=(args,))
job.name = name
jobs.append(job)
job.start()
# while len(jobs) > 0 :
# jobs = [job for job in jobs if job.is_alive()]
# time.sleep(1)
else:
tables = args['from'].split(',')
for name in tables :
name = name.strip()
args['from'] = name
log += [move(**args)]
print ("\n".join(log))
move(SYS_ARGS)
# # table = SYS_ARGS['from']
# # args = dict(config,**{"private_key":"../curation-prod.json"})
# args = dict(args,**SYS_ARGS)
# contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
# log = []
# if contexts :
# args['contexts'] = contexts
# log = move(**args)
# else:
# tables = args['from'].split(',')
# for name in tables :
# name = name.strip()
# args['from'] = name
# log += [move(**args)]
# print ("\n".join(log))
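The new --init branch walks a 'dep' section of the configuration and spawns one move() process per entry. A hypothetical configuration and invocation, inferred from the keys used in this diff:

    # Hypothetical 'dep' section of the configuration consumed by --init
    config = {
        "dep": {
            "tables": ["person", "visit_occurrence"],
            "queries": [
                {"from": "measurement", "sql": "SELECT * FROM :dataset.measurement"}
            ]
        }
    }
    # Each string becomes args['from']; each dict is used as args directly.
    # One Process(target=move, args=(args,)) is started per entry, e.g.:
    #   python finalize.py --init --config ./pipeline.json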

@ -14,7 +14,6 @@ from data.params import SYS_ARGS
#
# The configuration array is now loaded and we will execute the pipe line as follows
DATASET='combined20191004v2_deid'
class Components :
lock = Lock()
@ -120,37 +119,7 @@ class Components :
self.generate(args)
pass
def shuffle(self,args):
"""
"""
df = args['reader']() if 'reader' in args else args['data']
col = args['columns'][0]
distrib = df[col].value_counts()
values = np.array(distrib.index)
counts = np.array(distrib.values)
np.random.shuffle(values)
np.random.shuffle(counts)
N = len (values)
theta = np.random.sample()
pad = 0
# print (values)
iovalues = np.zeros(df.shape[0],dtype=df[col].dtype)
for i in range(N) :
# n = int(counts[i] - counts[i]*theta)
n = counts[i]
print ([counts[i],theta,n])
index = np.where(iovalues == 0)[0]
if index.size > 0 and index.size > n:
index = index[:n]
iovalues[index] = values[i]
np.random.shuffle(iovalues)
df[col] = iovalues
return df
def post(self,args):
pass
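The count-based reshuffling body is dropped in this hunk. Elsewhere in this commit (the gan.Predict hunk) the synthetic column is simply permuted with np.random.shuffle, which preserves the marginal value distribution while breaking any row-order correlation. A minimal sketch of that idea (the column name is made up):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'race': ['A', 'A', 'B', 'C', 'A']})
    before = df['race'].value_counts().sort_index()

    shuffled = df['race'].to_numpy().copy()
    np.random.shuffle(shuffled)          # random permutation, same multiset of values
    df['race'] = shuffled

    assert before.equals(df['race'].value_counts().sort_index())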
@ -177,7 +146,7 @@ class Components :
_args['gpu'] = 0
_args['num_gpu'] = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
_args['no_value']= args['no_value']
# _args['no_value']= args['no_value']
_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
@ -207,7 +176,7 @@ class Components :
# df = pd.DataFrame(df[ int (partition) ],columns = columns)
# max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000
# N = np.divide(df.shape[0],max_rows).astype(int) + 1
info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)}
info = {"name":args['columns'],"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"space":df[args['columns'][0]].unique().size, "part_size":int(PART_SIZE)}
logger.write({"module":"generate","action":"partition","input":info})
_args['partition'] = int(partition)
_args['continuous']= args['continuous'] if 'continuous' in args else []
@ -400,11 +369,11 @@ if __name__ == '__main__' :
generator.generate(args)
# Components.generate(args)
elif 'shuffle' in SYS_ARGS:
args['data'] = DATA[0]
for data in DATA :
args['data'] = data
_df = (Components()).shuffle(args)
print (DATA[0][args['columns']])
print ()
print (_df[args['columns']])
else:
# DATA = np.array_split(DATA,PART_SIZE)

@ -4,7 +4,7 @@ import sys
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
args = {"name":"data-maker","version":"1.3.2","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
@ -14,3 +14,5 @@ if sys.version_info[0] == 2 :
args['use_2to3_exclude_fixers'] = ['lib2to3.fixes.fix_import']
args['scripts']=['pipeline.py','finalize.py']
setup(**args)
