diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 24fabe8..b9b48e4 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -52,6 +52,7 @@ class Learner(Process): self._encoder = None self._map = None self._df = _args['data'] if 'data' in _args else None + self.name = self.__class__.__name__ # @@ -92,10 +93,22 @@ class Learner(Process): if self._df is None : self._df = reader.read(**_read_args) columns = self.columns if self.columns else self._df.columns + # + # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases + # - The code below tries to address the issue (Perhaps better suited for the reading components) + for name in columns : + _index = np.random.choice(np.arange(self._df[name].size),5,False) + no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] + print ([name,np.sum(no_value)]) + no_value = 0 if np.sum(no_value) > 0 else '' + + self._df[name] = self._df[name].fillna(no_value) + + # # convert the data to binary here ... - - _args = {"schema":self.get_schema(),"data":self._df,"columns":columns} + _schema = self.get_schema() + _args = {"schema":_schema,"data":self._df,"columns":columns} if self._map : _args['map'] = self._map self._encoder = prepare.Input(**_args) if self._df.shape[0] > 0 else None