bug fix: random shuffle improvements

master
Steve Nyemba 1 year ago
parent ef43f20e9c
commit b9596edd8e

@ -22,7 +22,7 @@ import nujson as json
from multiprocessing import Process, RLock from multiprocessing import Process, RLock
from datetime import datetime, timedelta from datetime import datetime, timedelta
from multiprocessing import Queue from multiprocessing import Queue
from version import __version__
import time import time
@ -179,6 +179,7 @@ class Learner(Process):
for name in columns : for name in columns :
# #
# randomly sampling 5 elements to make sense of data-types # randomly sampling 5 elements to make sense of data-types
if self._df[name].size < 5 : if self._df[name].size < 5 :
continue continue
_index = np.random.choice(np.arange(self._df[name].size),5,False) _index = np.random.choice(np.arange(self._df[name].size),5,False)
@ -552,27 +553,53 @@ class Shuffle(Generator):
""" """
def __init__(self,**_args): def __init__(self,**_args):
super().__init__(**_args) super().__init__(**_args)
if 'data' not in _args :
reader = transport.factory.instance(**self.store['source'])
self._df = reader.read(sql=self.info['sql'])
def run(self): def run(self):
np.random.seed(1) np.random.seed(1)
self.initalize() self.initalize()
#
# If we are given plain column names instead of a list of lists,
# gather those names into a single group of columns
_invColumns = []
_colNames = []
_ucolNames= []
for _item in self.info['columns'] :
if type(_item) == list :
_invColumns.append(_item)
elif _item in self._df.columns.tolist():
_colNames.append(_item)
#
# At this point we build the matrix of elements we are interested in, also taking any unspecified columns into account
#
if _colNames :
_invColumns.append(_colNames)
_ucolNames = list(set(self._df.columns) - set(_colNames))
if _ucolNames :
_invColumns += [ [_name] for _name in _ucolNames]
_xdf = pd.DataFrame()
_xdf = pd.DataFrame()
_index = np.arange(self._df.shape[0]) _index = np.arange(self._df.shape[0])
for _columns in _invColumns :
_tmpdf = self._df[_columns].copy()[_columns]
np.random.shuffle(_index) np.random.shuffle(_index)
np.random.shuffle(_index)
_iocolumns = self.info['columns']
_ocolumns = list(set(self._df.columns) - set(_iocolumns) )
# _iodf = pd.DataFrame(self._df[_ocolumns],self._df.loc[_index][_iocolumns],index=np.arange(_index.size))
_iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
# self._df = self._df.loc[_index][_ocolumns].join(_iodf)
self._df = self._df.loc[_index][_ocolumns]
self._df.index = np.arange(self._df.shape[0])
self._df = self._df.join(_iodf)
#
# The following is a full shuffle
self._df = self._df.loc[_index]
self._df.index = np.arange(self._df.shape[0])
_tmpdf = _tmpdf.iloc[_index]
if _xdf.shape[0] == 0 :
_xdf = _tmpdf
else:
_xdf = _xdf.join(_tmpdf)
_xdf = _xdf[self._df.columns]
self._df = _xdf
_log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}} _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
self.log(**_log) self.log(**_log)
try: try:

@ -0,0 +1 @@
__version__='1.7.0'

@ -1,10 +1,10 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
import os import os
import sys import sys
import version
def read(fname): def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read() return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.6.8", args = {"name":"data-maker","version":version.__version__,
"author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT", "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow'] args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']

@ -0,0 +1 @@
data/maker/version.py
Loading…
Cancel
Save