diff --git a/data/maker/__init__.py b/data/maker/__init__.py index 09bdb4c..8327eea 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -22,7 +22,7 @@ import nujson as json from multiprocessing import Process, RLock from datetime import datetime, timedelta from multiprocessing import Queue - +from version import __version__ import time @@ -179,6 +179,7 @@ class Learner(Process): for name in columns : # # randomly sampling 5 elements to make sense of data-types + if self._df[name].size < 5 : continue _index = np.random.choice(np.arange(self._df[name].size),5,False) @@ -552,27 +553,53 @@ class Shuffle(Generator): """ def __init__(self,**_args): super().__init__(**_args) + if 'data' not in _args : + reader = transport.factory.instance(**self.store['source']) + self._df = reader.read(sql=self.info['sql']) + def run(self): np.random.seed(1) + self.initalize() - _index = np.arange(self._df.shape[0]) - np.random.shuffle(_index) - np.random.shuffle(_index) - _iocolumns = self.info['columns'] - _ocolumns = list(set(self._df.columns) - set(_iocolumns) ) - # _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size)) - _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size)) - # self._df = self._df.loc[_index][_ocolumns].join(_iodf) - self._df = self._df.loc[_index][_ocolumns] - self._df.index = np.arange(self._df.shape[0]) - self._df = self._df.join(_iodf) # - # The following is a full shuffle - self._df = self._df.loc[_index] - self._df.index = np.arange(self._df.shape[0]) - + # If we are given lists of columns instead of a list-of-list + # unpack the list + _invColumns = [] + _colNames = [] + _ucolNames= [] + for _item in self.info['columns'] : + if type(_item) == list : + _invColumns.append(_item) + elif _item in self._df.columns.tolist(): + _colNames.append(_item) + # + # At this point we build the matrix of elements we are interested in considering the any unspecified column + # + if _colNames : + _invColumns.append(_colNames) + _ucolNames = list(set(self._df.columns) - set(_colNames)) + if _ucolNames : + _invColumns += [ [_name] for _name in _ucolNames] + + _xdf = pd.DataFrame() + _xdf = pd.DataFrame() + _index = np.arange(self._df.shape[0]) + for _columns in _invColumns : + + _tmpdf = self._df[_columns].copy()[_columns] + np.random.shuffle(_index) + + _tmpdf = _tmpdf.iloc[_index] + + if _xdf.shape[0] == 0 : + _xdf = _tmpdf + else: + _xdf = _xdf.join(_tmpdf) + + _xdf = _xdf[self._df.columns] + self._df = _xdf _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} self.log(**_log) try: diff --git a/data/maker/version.py b/data/maker/version.py new file mode 100644 index 0000000..6e0eb49 --- /dev/null +++ b/data/maker/version.py @@ -0,0 +1 @@ +__version__='1.7.0' diff --git a/setup.py b/setup.py index 505dae7..a30ac52 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,10 @@ from setuptools import setup, find_packages import os import sys - +import version def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.6.8", +args = {"name":"data-maker","version":version.__version__, "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] diff --git a/version.py b/version.py new file mode 120000 index 0000000..85fd196 --- /dev/null +++ b/version.py @@ -0,0 +1 @@ +data/maker/version.py \ No newline at end of file