diff --git a/data/bridge.py b/data/bridge.py
index 2e38431..137a504 100644
--- a/data/bridge.py
+++ b/data/bridge.py
@@ -160,20 +160,17 @@ class Binary :
         """
         # values = np.unique(column)
-        values = column.dropna().unique()
-        values.sort()
+        # values = column.dropna().unique()
+
+        # values.sort()
+        # column = column.values
+        values = self.get_column(column,size)
         column = column.values
         #
         # Let's treat the case of missing values i.e nulls
         #
         row_count,col_count = column.size,values.size
         # if row_count * col_count > size and row_count < size:
-        if col_count > size :
-            # N = np.divide(size,row_count).astype(int)
-            # N =
-            i = np.random.choice(col_count,size)
-            values = values[-i]
-            col_count = size




@@ -196,7 +193,17 @@ class Binary :
         return pd.DataFrame(matrix,columns=values)
     def apply(self,column,size):
         return self.__stream(column,size)
-    def get_column_values(self,column,size=-1):
+    def get_column(self,column,size=-1):
+        """
+        This function will return the columns that are available for processing ...
+        """
+        values = column.dropna().value_counts().index
+        if size > 0 :
+            values = values[:size]
+        values.sort_values()
+        return values
+
+    def _get_column_values(self,column,size=-1):
         values = column.dropna().unique()
         values.sort()

@@ -204,7 +211,7 @@ class Binary :
         # Let's treat the case of missing values i.e nulls
         #
         row_count,col_count = column.size,values.size
-        if col_count > size :
+        if col_count > size and size > 0:
             # N = np.divide(size,row_count).astype(int)
             # N =
             i = np.random.choice(col_count,size)
@@ -270,8 +277,8 @@ if __name__ == '__main__' :
     --export will export data to a specified location
     """
     df = pd.read_csv('sample.csv')
-    print ( pd.get_dummies(df.race))
-    print ( (Binary()).apply(df.race, 2))
+    print ( df.race.value_counts())
+    print ( (Binary()).apply(df['race'], 3))
     # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
     # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()

diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index 527d245..26cc4de 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -136,7 +136,7 @@ def train (**args) :
             # print (df[col].dtypes)
             # print (df[col].dropna/(axis=1).unique())
             # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
-            msize = args['matrix_size'] if 'matrix_size' in args else 128
+            msize = args['matrix_size'] if 'matrix_size' in args else -1
             args['real'] = (Binary()).apply(df[col],msize)


@@ -210,8 +210,8 @@ def generate(**args):
            # else:
            #     values = df[col].dropna().unique().tolist()

-            msize = args['matrix_size'] if 'matrix_size' in args else 128
-            values = bhandler.get_column_values(df[col],msize)
+            msize = args['matrix_size'] if 'matrix_size' in args else -1
+            values = bhandler.get_column(df[col],msize)



diff --git a/setup.py b/setup.py
index 44a59b1..0370cdc 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@ import sys

 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
-args = {"name":"data-maker","version":"1.3.0","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.3.1","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
 "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
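For orientation, a minimal sketch of how the reworked column handling is exercised end to end. It assumes the package is importable as data.bridge (matching the file layout above); the inline DataFrame is illustrative, not taken from the repository:

    import pandas as pd
    from data.bridge import Binary   # assumes data/bridge.py is importable as data.bridge

    # Illustrative stand-in for the 'race' column used in the __main__ block above.
    df = pd.DataFrame({'race': ['white', 'white', 'white', 'black', 'asian', 'black']})

    handler = Binary()

    # get_column() returns the distinct non-null values, capped to the `size` most
    # frequent ones when size > 0; size=-1 (the new default) keeps all of them.
    print(list(handler.get_column(df['race'], size=2)))
    print(list(handler.get_column(df['race'])))          # size defaults to -1

    # apply() builds the indicator matrix over that value set, mirroring the
    # updated calls in data/maker/__init__.py and the __main__ example above.
    print(handler.apply(df['race'], 2))

Because get_column() ranks values with value_counts(), a positive size keeps the most frequent categories rather than a random subset, which is the behavioural change this diff introduces over the old get_column_values().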