diff --git a/data/maker/__init__.py b/data/maker/__init__.py index fba1361..382c209 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -202,7 +202,7 @@ class Learner(Process): super(Learner, self).__init__() if 'gpu' in _args : - print (_args['gpu']) + os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) self.gpu = int(_args['gpu']) else: @@ -224,9 +224,13 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__+'::'+self.info['context']+'::'+self.info['from'] # # @TODO: allow for verbose mode so we have a sens of what is going on within the newtork # + if self.logger : + _args = {'module':self.name,'action':'init','context':self.info['context'],'gpu':(self.gpu if self.gpu is not None else -1)} + self.logger.write(_args) # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs' # sel.max_epoc @@ -249,6 +253,9 @@ class Learner(Process): if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None + if self.logger : + _args = {'module':self.name,'action':'data-prep','input':{'rows':self._df.shape[0],'cols':self._df.shape[1]} } + self.logger.write(_args) class Trainer(Learner): """ This will perform training using a GAN @@ -257,10 +264,11 @@ class Trainer(Learner): super().__init__(**_args) # self.info = _args['info'] self.limit = int(_args['limit']) if 'limit' in _args else None - self.name = _args['name'] + self.autopilot = _args['autopilot'] if 'autopilot' in _args else False self.generate = None self.candidates = int(_args['candidates']) if 'candidates' in _args else 1 + def run(self): self.initalize() if self._encoder is None : @@ -277,7 +285,7 @@ class Trainer(Learner): # # At this point we have the binary matrix, we can initiate training # - + beg = datetime.now().strftime('%Y-%m-%d %H:%M:%S') gTrain = gan.Train(**_args) gTrain.apply() @@ -293,6 +301,10 @@ class Trainer(Learner): _args['gpu'] = self.gpu g = Generator(**_args) # g.run() + if self.logger : + end = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + logs = {'module':self.name,'action':'train','input':{'start':beg,'end':end}} + self.logger.write(logs) self.generate = g if self.autopilot : self.generate.run() @@ -333,29 +345,38 @@ class Generator (Learner): _args['gpu'] = self.gpu gHandler = gan.Predict(**_args) gHandler.load_meta(columns=None) - _iomatrix = gHandler.apply() + _iomatrix = gHandler.apply() _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix] + if self.logger : + _size = np.sum([len(_item) for _item in _iomatrix]) + _log = {'module':self.name,'action':'io-data','input':{'candidates':len(_candidates),'rows':_size}} + self.logger.write(_log) self.post(_candidates) def approximate(self,_df): _columns = self.info['approximate'] - # _schema = {} - # for _info in self.get_schema() : - # _schema[_info['name']] = _info['type'] - - + for name in _columns : - batches = np.array_split(_df[name].fillna(np.nan).values,2) + if _df[name].size > 100 : + BATCH_SIZE = 10 + + else: + BATCH_SIZE = 1 + batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE) _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64 x = [] + _log = {'module':self.name,'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}} for values in batches : - - index = np.where(values != '') + + index = [ _x not in ['',None,np.nan] for _x in values] _values = np.random.dirichlet(values[index].astype(_type)) values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values) values[index] = values[index].astype(_type) x += values.tolist() - if x : + if x : + _log['input']['diff'] = 1 - np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size) _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64) + if self.logger : + self.logger.write(_log) return _df def make_date(self,**_args) : """ @@ -402,10 +423,11 @@ class Generator (Learner): if 'table' not in _store : _store['table'] = self.info['from'] writer = transport.factory.instance(**_store) - + N = 0 for _iodf in _candidates : _df = self._df.copy() _df[self.columns] = _iodf[self.columns] + N += _df.shape[0] # #@TODO: # Improve formatting with better post-processing pipeline @@ -422,8 +444,10 @@ class Generator (Learner): _df[name] = _dates _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] - writer.write(_df[self.columns],schema=_schema) - pass + + writer.write(_df,schema=_schema) + if self.logger : + self.logger.write({'module':self.name,'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class factory : _infocache = {} @staticmethod