|
|
|
@ -22,7 +22,7 @@ import nujson as json
|
|
|
|
|
from multiprocessing import Process, RLock
|
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
from multiprocessing import Queue
|
|
|
|
|
|
|
|
|
|
from version import __version__
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -33,6 +33,7 @@ class Learner(Process):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
super(Learner, self).__init__()
|
|
|
|
|
self._arch = {'init':_args}
|
|
|
|
|
self.ndx = 0
|
|
|
|
|
self._queue = Queue()
|
|
|
|
|
self.lock = RLock()
|
|
|
|
@ -44,6 +45,8 @@ class Learner(Process):
|
|
|
|
|
self.gpu = None
|
|
|
|
|
|
|
|
|
|
self.info = _args['info']
|
|
|
|
|
if 'context' not in self.info :
|
|
|
|
|
self.info['context'] = self.info['from']
|
|
|
|
|
self.columns = self.info['columns'] if 'columns' in self.info else None
|
|
|
|
|
self.store = _args['store']
|
|
|
|
|
|
|
|
|
@ -97,9 +100,12 @@ class Learner(Process):
|
|
|
|
|
# __info = (pd.DataFrame(self._states)[['name','path','args']]).to_dict(orient='records')
|
|
|
|
|
if self._states :
|
|
|
|
|
__info = {}
|
|
|
|
|
|
|
|
|
|
# print (self._states)
|
|
|
|
|
for key in self._states :
|
|
|
|
|
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key]]
|
|
|
|
|
_pipeline = self._states[key]
|
|
|
|
|
|
|
|
|
|
# __info[key] = ([{'name':_payload['name']} for _payload in _pipeline])
|
|
|
|
|
__info[key] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[key] if _item ]
|
|
|
|
|
self.log(object='state-space',action='load',input=__info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -173,6 +179,7 @@ class Learner(Process):
|
|
|
|
|
for name in columns :
|
|
|
|
|
#
|
|
|
|
|
# randomly sampling 5 elements to make sense of data-types
|
|
|
|
|
|
|
|
|
|
if self._df[name].size < 5 :
|
|
|
|
|
continue
|
|
|
|
|
_index = np.random.choice(np.arange(self._df[name].size),5,False)
|
|
|
|
@ -273,15 +280,20 @@ class Trainer(Learner):
|
|
|
|
|
|
|
|
|
|
_args['network_args']['max_epochs'] = _epochs[0]['epochs']
|
|
|
|
|
self.log(action='autopilot',input={'epoch':_epochs[0]})
|
|
|
|
|
g = Generator(**_args)
|
|
|
|
|
|
|
|
|
|
# g.run()
|
|
|
|
|
|
|
|
|
|
end = datetime.now() #.strftime('%Y-%m-%d %H:%M:%S')
|
|
|
|
|
_min = float((end-beg).seconds/ 60)
|
|
|
|
|
_logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}}
|
|
|
|
|
self.log(**_logs)
|
|
|
|
|
self._g = g
|
|
|
|
|
|
|
|
|
|
if self.autopilot :
|
|
|
|
|
|
|
|
|
|
# g = Generator(**_args)
|
|
|
|
|
|
|
|
|
|
g = Generator(**self._arch['init'])
|
|
|
|
|
self._g = g
|
|
|
|
|
self._g.run()
|
|
|
|
|
#
|
|
|
|
|
#@TODO Find a way to have the data in the object ....
|
|
|
|
@ -300,10 +312,15 @@ class Generator (Learner):
|
|
|
|
|
#
|
|
|
|
|
# We need to load the mapping information for the space we are working with ...
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1
|
|
|
|
|
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
|
|
|
|
|
# filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
|
|
|
|
|
_suffix = self.network_args['context']
|
|
|
|
|
filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'meta-',_suffix,'.json'])
|
|
|
|
|
self.log(**{'action':'init-map','input':{'filename':filename,'exists':os.path.exists(filename)}})
|
|
|
|
|
if os.path.exists(filename):
|
|
|
|
|
|
|
|
|
|
file = open(filename)
|
|
|
|
|
self._map = json.loads(file.read())
|
|
|
|
|
file.close()
|
|
|
|
@ -485,7 +502,10 @@ class Generator (Learner):
|
|
|
|
|
N = 0
|
|
|
|
|
for _iodf in _candidates :
|
|
|
|
|
_df = self._df.copy()
|
|
|
|
|
if self.columns :
|
|
|
|
|
_df[self.columns] = _iodf[self.columns]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
N += _df.shape[0]
|
|
|
|
|
if self._states and 'post' in self._states:
|
|
|
|
|
_df = State.apply(_df,self._states['post'])
|
|
|
|
@ -533,27 +553,55 @@ class Shuffle(Generator):
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self,**_args):
|
|
|
|
|
super().__init__(**_args)
|
|
|
|
|
if 'data' not in _args :
|
|
|
|
|
reader = transport.factory.instance(**self.store['source'])
|
|
|
|
|
self._df = reader.read(sql=self.info['sql'])
|
|
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
|
|
|
|
|
|
np.random.seed(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
self.initalize()
|
|
|
|
|
_index = np.arange(self._df.shape[0])
|
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
|
_iocolumns = self.info['columns']
|
|
|
|
|
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
|
|
|
|
|
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
|
|
|
|
|
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
|
|
|
|
|
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
|
|
|
|
|
self._df = self._df.loc[_index][_ocolumns]
|
|
|
|
|
self._df.index = np.arange(self._df.shape[0])
|
|
|
|
|
self._df = self._df.join(_iodf)
|
|
|
|
|
#
|
|
|
|
|
# The following is a full shuffle
|
|
|
|
|
self._df = self._df.loc[_index]
|
|
|
|
|
self._df.index = np.arange(self._df.shape[0])
|
|
|
|
|
# If we are given lists of columns instead of a list-of-list
|
|
|
|
|
# unpack the list
|
|
|
|
|
_invColumns = []
|
|
|
|
|
_colNames = []
|
|
|
|
|
_ucolNames= []
|
|
|
|
|
for _item in self.info['columns'] :
|
|
|
|
|
if type(_item) == list :
|
|
|
|
|
_invColumns.append(_item)
|
|
|
|
|
elif _item in self._df.columns.tolist():
|
|
|
|
|
_colNames.append(_item)
|
|
|
|
|
#
|
|
|
|
|
# At this point we build the matrix of elements we are interested in considering the any unspecified column
|
|
|
|
|
#
|
|
|
|
|
if _colNames :
|
|
|
|
|
_invColumns.append(_colNames)
|
|
|
|
|
_ucolNames = list(set(self._df.columns) - set(_colNames))
|
|
|
|
|
if _ucolNames :
|
|
|
|
|
_invColumns += [ [_name] for _name in _ucolNames]
|
|
|
|
|
|
|
|
|
|
_xdf = pd.DataFrame()
|
|
|
|
|
_xdf = pd.DataFrame()
|
|
|
|
|
_index = np.arange(self._df.shape[0])
|
|
|
|
|
|
|
|
|
|
for _columns in _invColumns :
|
|
|
|
|
|
|
|
|
|
_tmpdf = self._df[_columns].copy()[_columns]
|
|
|
|
|
np.random.seed(1)
|
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
|
print (_columns,_index)
|
|
|
|
|
# _values = _tmpdf.values[_index]
|
|
|
|
|
#_tmpdf = _tmpdf.iloc[_index]
|
|
|
|
|
_tmpdf = pd.DataFrame(_tmpdf.values[_index],columns=_columns)
|
|
|
|
|
if _xdf.shape[0] == 0 :
|
|
|
|
|
_xdf = _tmpdf
|
|
|
|
|
else:
|
|
|
|
|
_xdf = _xdf.join(_tmpdf)
|
|
|
|
|
|
|
|
|
|
_xdf = _xdf[self._df.columns]
|
|
|
|
|
self._df = _xdf
|
|
|
|
|
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
|
|
|
|
|
self.log(**_log)
|
|
|
|
|
try:
|
|
|
|
@ -580,6 +628,7 @@ class factory :
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
if _args['apply'] in [apply.RANDOM] :
|
|
|
|
|
pthread = Shuffle(**_args)
|
|
|
|
|