|
|
@ -22,7 +22,7 @@ import nujson as json
|
|
|
|
from multiprocessing import Process, RLock
|
|
|
|
from multiprocessing import Process, RLock
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
from multiprocessing import Queue
|
|
|
|
from multiprocessing import Queue
|
|
|
|
|
|
|
|
from version import __version__
|
|
|
|
import time
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -179,6 +179,7 @@ class Learner(Process):
|
|
|
|
for name in columns :
|
|
|
|
for name in columns :
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# randomly sampling 5 elements to make sense of data-types
|
|
|
|
# randomly sampling 5 elements to make sense of data-types
|
|
|
|
|
|
|
|
|
|
|
|
if self._df[name].size < 5 :
|
|
|
|
if self._df[name].size < 5 :
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
_index = np.random.choice(np.arange(self._df[name].size),5,False)
|
|
|
|
_index = np.random.choice(np.arange(self._df[name].size),5,False)
|
|
|
@ -552,27 +553,53 @@ class Shuffle(Generator):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
def __init__(self,**_args):
|
|
|
|
def __init__(self,**_args):
|
|
|
|
super().__init__(**_args)
|
|
|
|
super().__init__(**_args)
|
|
|
|
|
|
|
|
if 'data' not in _args :
|
|
|
|
|
|
|
|
reader = transport.factory.instance(**self.store['source'])
|
|
|
|
|
|
|
|
self._df = reader.read(sql=self.info['sql'])
|
|
|
|
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
def run(self):
|
|
|
|
|
|
|
|
|
|
|
|
np.random.seed(1)
|
|
|
|
np.random.seed(1)
|
|
|
|
|
|
|
|
|
|
|
|
self.initalize()
|
|
|
|
self.initalize()
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
# If we are given lists of columns instead of a list-of-list
|
|
|
|
|
|
|
|
# unpack the list
|
|
|
|
|
|
|
|
_invColumns = []
|
|
|
|
|
|
|
|
_colNames = []
|
|
|
|
|
|
|
|
_ucolNames= []
|
|
|
|
|
|
|
|
for _item in self.info['columns'] :
|
|
|
|
|
|
|
|
if type(_item) == list :
|
|
|
|
|
|
|
|
_invColumns.append(_item)
|
|
|
|
|
|
|
|
elif _item in self._df.columns.tolist():
|
|
|
|
|
|
|
|
_colNames.append(_item)
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
# At this point we build the matrix of elements we are interested in considering the any unspecified column
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
if _colNames :
|
|
|
|
|
|
|
|
_invColumns.append(_colNames)
|
|
|
|
|
|
|
|
_ucolNames = list(set(self._df.columns) - set(_colNames))
|
|
|
|
|
|
|
|
if _ucolNames :
|
|
|
|
|
|
|
|
_invColumns += [ [_name] for _name in _ucolNames]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_xdf = pd.DataFrame()
|
|
|
|
|
|
|
|
_xdf = pd.DataFrame()
|
|
|
|
_index = np.arange(self._df.shape[0])
|
|
|
|
_index = np.arange(self._df.shape[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for _columns in _invColumns :
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_tmpdf = self._df[_columns].copy()[_columns]
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
np.random.shuffle(_index)
|
|
|
|
|
|
|
|
_iocolumns = self.info['columns']
|
|
|
|
|
|
|
|
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
|
|
|
|
|
|
|
|
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
|
|
|
|
|
|
|
|
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
|
|
|
|
|
|
|
|
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
|
|
|
|
|
|
|
|
self._df = self._df.loc[_index][_ocolumns]
|
|
|
|
|
|
|
|
self._df.index = np.arange(self._df.shape[0])
|
|
|
|
|
|
|
|
self._df = self._df.join(_iodf)
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
# The following is a full shuffle
|
|
|
|
|
|
|
|
self._df = self._df.loc[_index]
|
|
|
|
|
|
|
|
self._df.index = np.arange(self._df.shape[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_tmpdf = _tmpdf.iloc[_index]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if _xdf.shape[0] == 0 :
|
|
|
|
|
|
|
|
_xdf = _tmpdf
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
_xdf = _xdf.join(_tmpdf)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_xdf = _xdf[self._df.columns]
|
|
|
|
|
|
|
|
self._df = _xdf
|
|
|
|
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
|
|
|
|
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
|
|
|
|
self.log(**_log)
|
|
|
|
self.log(**_log)
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|