diff --git a/data/maker/__init__.py b/data/maker/__init__.py index e252de5..378c226 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -27,22 +27,25 @@ class ContinuousToDiscrete : values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) # _map = [{"index":BOUNDS.index(i),"ubound":i} for i in BOUNDS] - _matrix = [] - m = [] - for value in X : - x_ = np.zeros(n) + # _matrix = [] + # m = [] + # for value in X : + # x_ = np.zeros(n) - for row in BOUNDS : + # for row in BOUNDS : - if value>= row.left and value <= row.right : - index = BOUNDS.index(row) - x_[index] = 1 - break - _matrix += x_.tolist() - # - # for items in BOUNDS : - # index = BOUNDS.index(items) - return np.array(_matrix).reshape(len(X),n) + # if value>= row.left and value <= row.right : + # index = BOUNDS.index(row) + # x_[index] = 1 + # break + # _matrix += x_.tolist() + # # + # # for items in BOUNDS : + # # index = BOUNDS.index(items) + + # return np.array(_matrix).reshape(len(X),n) + matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) + @staticmethod def bounds(x,n): @@ -65,9 +68,15 @@ class ContinuousToDiscrete : # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) # # # print (BOUNDS) l = {} - for value in X : - values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] + for i in np.arange(len(X)): #value in X : + + value = X[i] + for item in BOUNDS : + if value >= item.left and value <= item.right : + values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] + break + # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] # # values = [] @@ -223,11 +232,10 @@ def generate(**args): i = np.where (i == False)[0] else: i = np.where( r[col] != None)[0] - _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) + _approx = ContinuousToDiscrete.continuous(r[col][i],BIN_SIZE) #-- approximating based on arbitrary bins r[col][i] = _approx - _df[col] = r[col] #ContinuousToDiscrete.continuous(r[col],BIN_SIZE) if col in CONTINUOUS else r[col] - # _df[col] = r[col] + _df[col] = r[col] # # @TODO: log basic stats about the synthetic attribute # diff --git a/pipeline.py b/pipeline.py index 066a418..5af9550 100644 --- a/pipeline.py +++ b/pipeline.py @@ -47,7 +47,7 @@ class Components : logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}}) credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json') - df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').astype(object) + df = pd.read_gbq(SQL,credentials=credentials,dialect='standard') return df # return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna() diff --git a/setup.py b/setup.py index c441e36..1c8aef0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import sys def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -args = {"name":"data-maker","version":"1.2.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", +args = {"name":"data-maker","version":"1.2.6","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'