|
|
@ -202,7 +202,7 @@ class Learner(Process):
|
|
|
|
|
|
|
|
|
|
|
|
super(Learner, self).__init__()
|
|
|
|
super(Learner, self).__init__()
|
|
|
|
if 'gpu' in _args :
|
|
|
|
if 'gpu' in _args :
|
|
|
|
print (_args['gpu'])
|
|
|
|
|
|
|
|
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
|
|
|
|
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
|
|
|
|
self.gpu = int(_args['gpu'])
|
|
|
|
self.gpu = int(_args['gpu'])
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -224,9 +224,13 @@ class Learner(Process):
|
|
|
|
self._encoder = None
|
|
|
|
self._encoder = None
|
|
|
|
self._map = None
|
|
|
|
self._map = None
|
|
|
|
self._df = _args['data'] if 'data' in _args else None
|
|
|
|
self._df = _args['data'] if 'data' in _args else None
|
|
|
|
|
|
|
|
self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from']
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# @TODO: allow for verbose mode so we have a sense of what is going on within the network
|
|
|
|
# @TODO: allow for verbose mode so we have a sense of what is going on within the network
|
|
|
|
#
|
|
|
|
#
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
_args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)}
|
|
|
|
|
|
|
|
self.logger.write(_args)
|
|
|
|
|
|
|
|
|
|
|
|
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
|
|
|
|
# self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
|
|
|
|
# sel.max_epoc
|
|
|
|
# sel.max_epoc
|
|
|
@ -249,6 +253,9 @@ class Learner(Process):
|
|
|
|
if self._map :
|
|
|
|
if self._map :
|
|
|
|
_args['map'] = self._map
|
|
|
|
_args['map'] = self._map
|
|
|
|
self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None
|
|
|
|
self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
_args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} }
|
|
|
|
|
|
|
|
self.logger.write(_args)
|
|
|
|
class Trainer(Learner):
|
|
|
|
class Trainer(Learner):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
This will perform training using a GAN
|
|
|
|
This will perform training using a GAN
|
|
|
@ -257,10 +264,11 @@ class Trainer(Learner):
|
|
|
|
super().__init__(**_args)
|
|
|
|
super().__init__(**_args)
|
|
|
|
# self.info = _args['info']
|
|
|
|
# self.info = _args['info']
|
|
|
|
self.limit = int(_args['limit']) if 'limit' in _args else None
|
|
|
|
self.limit = int(_args['limit']) if 'limit' in _args else None
|
|
|
|
self.name = _args['name']
|
|
|
|
|
|
|
|
self.autopilot = _args['autopilot'] if 'autopilot' in _args else False
|
|
|
|
self.autopilot = _args['autopilot'] if 'autopilot' in _args else False
|
|
|
|
self.generate = None
|
|
|
|
self.generate = None
|
|
|
|
self.candidates = int(_args['candidates']) if 'candidates' in _args else 1
|
|
|
|
self.candidates = int(_args['candidates']) if 'candidates' in _args else 1
|
|
|
|
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
def run(self):
|
|
|
|
self.initalize()
|
|
|
|
self.initalize()
|
|
|
|
if self._encoder is None :
|
|
|
|
if self._encoder is None :
|
|
|
@ -277,7 +285,7 @@ class Trainer(Learner):
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# At this point we have the binary matrix, we can initiate training
|
|
|
|
# At this point we have the binary matrix, we can initiate training
|
|
|
|
#
|
|
|
|
#
|
|
|
|
|
|
|
|
beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
gTrain = gan.Train(**_args)
|
|
|
|
gTrain = gan.Train(**_args)
|
|
|
|
gTrain.apply()
|
|
|
|
gTrain.apply()
|
|
|
|
|
|
|
|
|
|
|
@ -293,6 +301,10 @@ class Trainer(Learner):
|
|
|
|
_args['gpu'] = self.gpu
|
|
|
|
_args['gpu'] = self.gpu
|
|
|
|
g = Generator(**_args)
|
|
|
|
g = Generator(**_args)
|
|
|
|
# g.run()
|
|
|
|
# g.run()
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
end = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
|
|
|
|
logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}}
|
|
|
|
|
|
|
|
self.logger.write(logs)
|
|
|
|
self.generate = g
|
|
|
|
self.generate = g
|
|
|
|
if self.autopilot :
|
|
|
|
if self.autopilot :
|
|
|
|
self.generate.run()
|
|
|
|
self.generate.run()
|
|
|
@ -333,29 +345,38 @@ class Generator (Learner):
|
|
|
|
_args['gpu'] = self.gpu
|
|
|
|
_args['gpu'] = self.gpu
|
|
|
|
gHandler = gan.Predict(**_args)
|
|
|
|
gHandler = gan.Predict(**_args)
|
|
|
|
gHandler.load_meta(columns=None)
|
|
|
|
gHandler.load_meta(columns=None)
|
|
|
|
_iomatrix = gHandler.apply()
|
|
|
|
_iomatrix = gHandler.apply()
|
|
|
|
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
|
|
|
|
_candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
_size = np.sum([len(_item) for _item in _iomatrix])
|
|
|
|
|
|
|
|
_log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}}
|
|
|
|
|
|
|
|
self.logger.write(_log)
|
|
|
|
self.post(_candidates)
|
|
|
|
self.post(_candidates)
|
|
|
|
def approximate(self,_df):
|
|
|
|
def approximate(self,_df):
|
|
|
|
_columns = self.info['approximate']
|
|
|
|
_columns = self.info['approximate']
|
|
|
|
# _schema = {}
|
|
|
|
|
|
|
|
# for _info in self.get_schema() :
|
|
|
|
|
|
|
|
# _schema[_info['name']] = _info['type']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for name in _columns :
|
|
|
|
for name in _columns :
|
|
|
|
batches = np.array_split(_df[name].fillna(np.nan).values,2)
|
|
|
|
if _df[name].size > 100 :
|
|
|
|
|
|
|
|
BATCH_SIZE = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
BATCH_SIZE = 1
|
|
|
|
|
|
|
|
batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE)
|
|
|
|
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
|
|
|
|
_type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
|
|
|
|
x = []
|
|
|
|
x = []
|
|
|
|
|
|
|
|
_log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}}
|
|
|
|
for values in batches :
|
|
|
|
for values in batches :
|
|
|
|
|
|
|
|
|
|
|
|
index = np.where(values != '')
|
|
|
|
index = [ _x not in ['',None,np.nan] for _x in values]
|
|
|
|
_values = np.random.dirichlet(values[index].astype(_type))
|
|
|
|
_values = np.random.dirichlet(values[index].astype(_type))
|
|
|
|
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
|
|
|
|
values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
|
|
|
|
values[index] = values[index].astype(_type)
|
|
|
|
values[index] = values[index].astype(_type)
|
|
|
|
x += values.tolist()
|
|
|
|
x += values.tolist()
|
|
|
|
if x :
|
|
|
|
if x :
|
|
|
|
|
|
|
|
_log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size)
|
|
|
|
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
|
|
|
|
_df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
self.logger.write(_log)
|
|
|
|
return _df
|
|
|
|
return _df
|
|
|
|
def make_date(self,**_args) :
|
|
|
|
def make_date(self,**_args) :
|
|
|
|
"""
|
|
|
|
"""
|
|
|
@ -402,10 +423,11 @@ class Generator (Learner):
|
|
|
|
if 'table' not in _store :
|
|
|
|
if 'table' not in _store :
|
|
|
|
_store['table'] = self.info['from']
|
|
|
|
_store['table'] = self.info['from']
|
|
|
|
writer = transport.factory.instance(**_store)
|
|
|
|
writer = transport.factory.instance(**_store)
|
|
|
|
|
|
|
|
N = 0
|
|
|
|
for _iodf in _candidates :
|
|
|
|
for _iodf in _candidates :
|
|
|
|
_df = self._df.copy()
|
|
|
|
_df = self._df.copy()
|
|
|
|
_df[self.columns] = _iodf[self.columns]
|
|
|
|
_df[self.columns] = _iodf[self.columns]
|
|
|
|
|
|
|
|
N += _df.shape[0]
|
|
|
|
#
|
|
|
|
#
|
|
|
|
#@TODO:
|
|
|
|
#@TODO:
|
|
|
|
# Improve formatting with better post-processing pipeline
|
|
|
|
# Improve formatting with better post-processing pipeline
|
|
|
@ -422,8 +444,10 @@ class Generator (Learner):
|
|
|
|
_df[name] = _dates
|
|
|
|
_df[name] = _dates
|
|
|
|
_schema = self.get_schema()
|
|
|
|
_schema = self.get_schema()
|
|
|
|
_schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema]
|
|
|
|
_schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema]
|
|
|
|
writer.write(_df[self.columns],schema=_schema)
|
|
|
|
|
|
|
|
pass
|
|
|
|
writer.write(_df,schema=_schema)
|
|
|
|
|
|
|
|
if self.logger :
|
|
|
|
|
|
|
|
self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}})
|
|
|
|
class factory :
|
|
|
|
class factory :
|
|
|
|
_infocache = {}
|
|
|
|
_infocache = {}
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|