not sure about the changes (oops)

dev
Steve L. Nyemba 5 years ago
parent 63a7f1a968
commit 31ca5886f0

@@ -43,6 +43,10 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 class void :
     pass
 class GNet :
+    def log(self,**args):
+        self.logs = dict(args,**self.logs)
     """
     This is the base class of the generative network functions; the details will be implemented in the subclasses.
     An instance of this class is accessed as follows
@@ -52,7 +56,7 @@ class GNet :
     def __init__(self,**args):
         self.layers = void()
         self.layers.normalize = self.normalize
+        self.logs = {}
         self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
@@ -95,6 +99,15 @@ class GNet :
         self.train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT])
         self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
+        if self.logger :
+            #
+            # We will clear the logs from the data-store
+            #
+            column = self.ATTRIBUTES['synthetic']
+            db = self.logger.db
+            if db[column].count() > 0 :
+                db.backup.insert({'name':column,'logs':list(db[column].find()) })
+                db[column].drop()
     def load_meta(self,column):
         """
@@ -114,7 +127,9 @@ class GNet :
     def log_meta(self,**args) :
         _object = {
+            '_id':'meta',
             'CONTEXT':self.CONTEXT,
             'ATTRIBUTES':self.ATTRIBUTES,
             'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
@@ -314,6 +329,11 @@ class Train (GNet):
         # print ([" *** ",self.BATCHSIZE_PER_GPU])
         self.meta = self.log_meta()
+        if(self.logger):
+            self.logger.write( row=self.meta )
+        self.log (real_shape=list(self._REAL.shape),label_shape = list(self._LABEL.shape),meta_data=self.meta)
     def load_meta(self, column):
         """
         This function will delegate the calls to load meta data to its dependents
@ -350,11 +370,14 @@ class Train (GNet):
if stage == 'D': if stage == 'D':
w, loss = self.discriminator.loss(real=real, fake=fake, label=label) w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
#losses = tf.get_collection('dlosses', scope) #losses = tf.get_collection('dlosses', scope)
flag = 'dlosses'
losses = tf.compat.v1.get_collection('dlosses', scope) losses = tf.compat.v1.get_collection('dlosses', scope)
else: else:
w, loss = self.generator.loss(fake=fake, label=label) w, loss = self.generator.loss(fake=fake, label=label)
#losses = tf.get_collection('glosses', scope) #losses = tf.get_collection('glosses', scope)
flag = 'glosses'
losses = tf.compat.v1.get_collection('glosses', scope) losses = tf.compat.v1.get_collection('glosses', scope)
# losses = tf.compat.v1.get_collection(flag, scope)
total_loss = tf.add_n(losses, name='total_loss') total_loss = tf.add_n(losses, name='total_loss')
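The new flag variable hints at collapsing the two get_collection calls into one (the consolidated call is still commented out in this commit). A small, runnable illustration of the collection mechanism involved, with a toy loss standing in for the discriminator/generator losses:

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()

    # register a toy loss under the 'dlosses' collection, mirroring the pattern in the hunk
    d_loss = tf.constant(0.5, name='d_loss')
    tf.compat.v1.add_to_collection('dlosses', d_loss)

    flag = 'dlosses'                               # would be 'glosses' on the generator branch
    losses = tf.compat.v1.get_collection(flag)     # the scope argument is omitted for brevity
    total_loss = tf.add_n(losses, name='total_loss')

    with tf.compat.v1.Session() as sess:
        print(sess.run(total_loss))                # 0.5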
@@ -369,7 +392,8 @@ class Train (GNet):
         dataset = dataset.repeat(10000)
         dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
         dataset = dataset.prefetch(1)
-        iterator = dataset.make_initializable_iterator()
+        # iterator = dataset.make_initializable_iterator()
+        iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
         # next_element = iterator.get_next()
         # init_op = iterator.initializer
         return iterator, features_placeholder, labels_placeholder
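dataset.make_initializable_iterator() is gone from the TF 2.x API surface; tf.compat.v1.data.make_initializable_iterator(dataset) is the drop-in replacement used here. A self-contained sketch of the pattern (shapes and batch size are illustrative):

    import numpy as np
    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()
    features = tf.compat.v1.placeholder(tf.float32, shape=(None, 8))
    dataset = tf.data.Dataset.from_tensor_slices(features).repeat(10).batch(4)
    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
    next_element = iterator.get_next()

    with tf.compat.v1.Session() as sess:
        # the placeholder is fed when the iterator is initialized
        sess.run(iterator.initializer, feed_dict={features: np.random.rand(16, 8)})
        print(sess.run(next_element).shape)   # (4, 8)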
@@ -406,6 +430,9 @@ class Train (GNet):
         # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10
         REAL = self._REAL
         LABEL= self._LABEL
+        if (self.logger):
+            pass
         with tf.device('/cpu:0'):
             opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
             opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)
@@ -441,7 +468,7 @@ class Train (GNet):
                 print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
                 # print (dir (w_distance))
-                logs.append({"epoch":epoch,"distance":-w_sum })
+                logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) })
                 if epoch % self.MAX_EPOCHS == 0:
                     # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
@@ -453,8 +480,13 @@ class Train (GNet):
             #
             if self.logger :
                 row = {"logs":logs} #,"model":pickle.dump(sess)}
                 self.logger.write(row=row)
+            #
+            # @TODO:
+            # We should upload the files in the checkpoint
+            # This would allow the learnt model to be portable to another system
+            #
+        tf.compat.v1.reset_default_graph()
 class Predict(GNet):
     """
@@ -482,35 +514,58 @@ class Predict(GNet):
         fake = self.generator.network(inputs=z, label=label)
         init = tf.compat.v1.global_variables_initializer()
         saver = tf.compat.v1.train.Saver()
+        df = pd.DataFrame()
+        CANDIDATE_COUNT = 1000
+        NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0]
         with tf.compat.v1.Session() as sess:
             # sess.run(init)
             saver.restore(sess, model_dir)
             labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
+            found = []
             labels= demo
+            for i in np.arange(CANDIDATE_COUNT) :
                 f = sess.run(fake,feed_dict={y:labels})
                 #
                 # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
+                # The code below will ensure we have some acceptable cardinal relationships between id and synthetic values
                 #
                 df = ( pd.DataFrame(np.round(f).astype(np.int32)))
+                p = 0 not in df.sum(axis=1).values
+                if p:
+                    found.append(df)
+                    if len(found) == NTH_VALID_CANDIDATE or i == CANDIDATE_COUNT:
+                        break
+                else:
+                    continue
             # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
             # df = (i * df).sum(axis=1)
             #
             # In case we are dealing with actual values like diagnosis codes we can perform
             #
+            df = found[np.random.choice(np.arange(len(found)),1)[0]]
             columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list) else [self.ATTRIBUTES['synthetic']]
-            r = np.zeros((self.ROW_COUNT,len(columns)))
-            for col in df :
-                i = np.where(df[col])[0]
-                r[i] = col
-            df = pd.DataFrame(r,columns=columns)
-            df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1))
-            return df.to_dict(orient='lists')
+            # r = np.zeros((self.ROW_COUNT,len(columns)))
+            r = np.zeros(self.ROW_COUNT)
+            df.columns = self.values
+            if len(found):
+                print (len(found),NTH_VALID_CANDIDATE)
+                # x = df * self.values
+                df = pd.DataFrame( df.apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
+                df.columns = columns
+            tf.compat.v1.reset_default_graph()
+            return df.to_dict(orient='list')
             # return df.to_dict(orient='list')
             # count = str(len(os.listdir(self.out_dir)))
             # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
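The new candidate loop keeps only generated one-hot matrices in which every row has at least one bit set, then picks one at random and maps each row back to a value. A small, self-contained sketch of that idea; the random matrix below stands in for the GAN output and the value list is made up:

    import numpy as np
    import pandas as pd

    values = ['250.00', 'E11.9', 'I10']                 # hypothetical synthetic values
    ROW_COUNT, CANDIDATE_COUNT = 5, 1000
    NTH_VALID_CANDIDATE = np.random.choice(np.arange(2, 60), 1)[0]

    found = []
    for i in range(CANDIDATE_COUNT):
        f = np.random.rand(ROW_COUNT, len(values))      # stand-in for sess.run(fake, ...)
        df = pd.DataFrame(np.round(f).astype(np.int32))
        if 0 not in df.sum(axis=1).values:              # reject candidates with an all-zero row
            found.append(df)
            if len(found) == NTH_VALID_CANDIDATE:
                break

    df = found[np.random.choice(len(found), 1)[0]]
    # pick one of the set bits in each row and translate it to the corresponding value
    synthetic = df.apply(lambda row: values[np.random.choice(np.where(row.values != 0)[0], 1)[0]], axis=1)
    print(synthetic.tolist())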

@@ -12,6 +12,7 @@ import pandas as pd
 import numpy as np
 import data.gan as gan
 from transport import factory
+import threading as thread
 def train (**args) :
     """
     This function is intended to train the GAN in order to learn about the distribution of the features
@@ -21,19 +22,22 @@ def train (**args) :
     :data data-frame to be synthesized
     :context label of what we are synthesizing
     """
-    column = args['column']
+    column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
     column_id = args['id']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
-    # logs = args['logs']
-    # real = pd.get_dummies(df[column]).astype(np.float32).values
-    # labels = pd.get_dummies(df[column_id]).astype(np.float32).values
-    args['real'] = pd.get_dummies(df[column]).astype(np.float32).values
+    df.columns = [name.lower() for name in df.columns]
+    #
+    # If we have several columns we will proceed one at a time (it could be done in separate threads)
+    # @TODO : Consider performing this task on several threads/GPUs simultaneously
+    #
     args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
-    # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu']
-    # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs']
+    for col in column :
+        args['real'] = pd.get_dummies(df[col]).astype(np.float32).values
+        args['column'] = col
+        args['context'] = col
         context = args['context']
         if 'store' in args :
             args['store']['args']['doc'] = context
             logger = factory.instance(**args['store'])
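With this change train() accepts either a single column name or a list and fits one model per column. A hedged usage sketch; the frame, field names, and the omission of a 'store' configuration are illustrative assumptions, not part of the commit:

    import pandas as pd
    import data.maker

    df = pd.DataFrame({'id':     [1, 2, 3, 4],
                       'gender': ['M', 'F', 'F', 'M'],
                       'race':   ['white', 'black', 'asian', 'white']})
    # one GAN is trained per entry in 'column'; 'id' supplies the label matrix
    data.maker.train(column=['gender', 'race'], id='id', data=df)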
@@ -42,9 +46,18 @@ def train (**args) :
         else:
             logger = None
         trainer = gan.Train(**args)
-        # trainer = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs)
-        return trainer.apply()
+        trainer.apply()
+def post(**args):
+    """
+    This uploads the tensorflow checkpoint to a data-store (mongodb, bigquery, s3)
+    """
+    pass
+def get(**args):
+    """
+    This function will restore a checkpoint from persistent storage onto disk
+    """
+    pass
 def generate(**args):
     """
     This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
@@ -57,29 +70,27 @@ def generate(**args):
     """
     # df = args['data']
     df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
-    column = args['column']
-    column_id = args['id']
-    # logs = args['logs']
-    # context = args['context']
-    # num_gpu = 1 if 'num_gpu' not in args else args['num_gpu']
-    # max_epochs = 10 if 'max_epochs' not in args else args['max_epochs']
+    column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
+    column_id = args['id']
     #
     #@TODO:
     # If the identifier is not present, we should find a way to determine or make one
     #
-    #ocolumns= list(set(df.columns.tolist())- set(columns))
-    values = df[column].unique().tolist()
-    values.sort()
-    # labels = pd.get_dummies(df[column_id]).astype(np.float32).values
     args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values
+    _df = df.copy()
+    for col in column :
+        args['context'] = col
+        args['column'] = col
+        values = df[col].unique().tolist()
+        # values.sort()
         args['values'] = values
-    # handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs)
+        #
+        # we can determine the cardinalities here so we know what to allow or disallow
         handler = gan.Predict (**args)
-    handler.load_meta(column)
+        handler.load_meta(col)
         r = handler.apply()
-    _df = df.copy()
-    _df[column] = r[column]
+        # print (r)
+        _df[col] = r[col]
+        # break
     return _df
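A matching usage sketch for the updated generate(), which now also accepts a list of columns; the file name and column names are illustrative, and the data must contain the 'id' column used for the labels:

    import pandas as pd
    import data.maker

    df = pd.read_csv('sample.csv')
    synthetic = data.maker.generate(column=['gender', 'race'], id='id', data=df)
    # 'synthetic' is a copy of df where each listed column has been replaced by
    # values produced by the per-column model that train() fitted earlier
    print(synthetic.head())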

@@ -15,13 +15,15 @@ if 'config' in SYS_ARGS :
     _df = data.maker.generate(**ARGS)
     odf = pd.read_csv (ARGS['data'])
     odf.columns = [name.lower() for name in odf.columns]
-    column = [ARGS['column'] ] #+ ARGS['id']
-    print (column)
-    print (_df[column].risk.evaluate())
-    print (odf[column].risk.evaluate())
-    _x = pd.get_dummies(_df[column]).values
-    y = pd.get_dummies(odf[column]).values
-    N = _df.shape[0]
-    print (np.mean([ wd(_x[i],y[i])for i in range(0,N)]))
+    column = ARGS['column'] if isinstance(ARGS['column'],list) else [ARGS['column']]
+    print(pd.merge(odf,_df, on='id'))
+    # print (_df[column].risk.evaluate(flag='synth'))
+    # print (odf[column].risk.evaluate(flag='original'))
+    # _x = pd.get_dummies(_df[column]).values
+    # y = pd.get_dummies(odf[column]).values
+    # N = _df.shape[0]
+    # print (np.mean([ wd(_x[i],y[i])for i in range(0,N)]))
+    # print (wd(_x[0],y[0]) )
     # column = SYS_ARGS['column']
     # odf = open(SYS_ARGS['data'])
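The block just commented out compared the one-hot encodings of the original and synthetic columns with wd, presumably scipy.stats.wasserstein_distance imported under that alias elsewhere in the script. A self-contained sketch of that comparison on made-up data:

    import numpy as np
    import pandas as pd
    from scipy.stats import wasserstein_distance as wd

    odf = pd.DataFrame({'gender': ['M', 'F', 'F', 'M']})    # original column
    _df = pd.DataFrame({'gender': ['F', 'F', 'M', 'M']})    # synthetic column
    _x = pd.get_dummies(_df['gender']).values
    y = pd.get_dummies(odf['gender']).values
    N = _df.shape[0]
    # mean row-wise distance between the two one-hot encodings
    print(np.mean([wd(_x[i], y[i]) for i in range(N)]))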