|
|
@ -52,6 +52,7 @@ class Learner(Process):
|
|
|
|
self._encoder = None
|
|
|
|
self._encoder = None
|
|
|
|
self._map = None
|
|
|
|
self._map = None
|
|
|
|
self._df = _args['data'] if 'data' in _args else None
|
|
|
|
self._df = _args['data'] if 'data' in _args else None
|
|
|
|
|
|
|
|
|
|
|
|
self.name = self.__class__.__name__
|
|
|
|
self.name = self.__class__.__name__
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
#
|
|
|
@ -93,9 +94,21 @@ class Learner(Process):
|
|
|
|
self._df = reader.read(**_read_args)
|
|
|
|
self._df = reader.read(**_read_args)
|
|
|
|
columns = self.columns if self.columns else self._df.columns
|
|
|
|
columns = self.columns if self.columns else self._df.columns
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# convert the data to binary here ...
|
|
|
|
# Below is a source of inefficiency; unfortunately, Python's type inference doesn't work well in certain cases
|
|
|
|
|
|
|
|
# - The code below tries to address the issue (perhaps it would be better suited to the reading components)
|
|
|
|
|
|
|
|
# Guess, per column, whether the values are numeric by sampling a few entries,
# then fill missing entries with a matching placeholder: 0 for numeric columns,
# '' for everything else.
# The match is on the exact type (not isinstance), so e.g. bool values are not
# counted as numeric.
_numeric_types = (int, float, np.int64, np.int32, np.float32, np.float64)
for name in columns:
    _size = self._df[name].size
    if _size == 0:
        # Nothing to sample and nothing to fill; np.random.choice rejects an
        # empty population when samples are requested.
        continue
    # Sampling is done without replacement, so the sample can never be larger
    # than the column itself: cap it at the column size (at most 5 values).
    _index = np.random.choice(np.arange(_size), min(5, _size), False)
    _sample = self._df[name].values[_index]
    _has_numeric = any(type(value) in _numeric_types for value in _sample)
    no_value = 0 if _has_numeric else ''
    self._df[name] = self._df[name].fillna(no_value)
|
|
|
|
|
|
|
|
|
|
|
|
_args = {"schema":self.get_schema(),"data":self._df,"columns":columns}
|
|
|
|
|
|
|
|
|
|
|
|
#
|
|
|
|
|
|
|
|
# convert the data to binary here ...
|
|
|
|
|
|
|
|
_schema = self.get_schema()
|
|
|
|
|
|
|
|
_args = {"schema":_schema,"data":self._df,"columns":columns}
|
|
|
|
if self._map :
|
|
|
|
if self._map :
|
|
|
|
_args['map'] = self._map
|
|
|
|
_args['map'] = self._map
|
|
|
|
self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None
|
|
|
|
self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None
|
|
|
|