@ -160,20 +160,17 @@ class Binary :
"""
# values = np.unique(column)
values = column . dropna ( ) . unique ( )
values . sort ( )
# values = column.dropna().unique()
# values.sort()
# column = column.values
values = self . get_column ( column , size )
column = column . values
#
# Let's treat the case of missing values i.e nulls
#
row_count , col_count = column . size , values . size
# if row_count * col_count > size and row_count < size:
if col_count > size :
# N = np.divide(size,row_count).astype(int)
# N =
i = np . random . choice ( col_count , size )
values = values [ - i ]
col_count = size
@ -196,7 +193,17 @@ class Binary :
return pd . DataFrame ( matrix , columns = values )
def apply(self, column, size):
    """
    Public entry point: hand ``column`` and ``size`` to the private
    ``__stream`` routine and return its result unchanged.

    :param column: column to process (forwarded as-is)
    :param size:   size argument (forwarded as-is)
    :return: whatever ``__stream`` returns for these arguments
    """
    result = self.__stream(column, size)
    return result
def get_column_values ( self , column , size = - 1 ) :
def get_column(self, column, size=-1):
    """
    Return the distinct non-null values of ``column`` that are available
    for processing, sorted in ascending order.

    :param column: pandas Series to inspect
    :param size:   maximum number of values to keep; when positive, only the
                   ``size`` most frequent values are retained, otherwise
                   (default -1) every distinct value is returned
    :return: pandas Index holding the retained values in sorted order
    """
    # value_counts() orders by descending frequency, so slicing the head
    # keeps the most frequent values.
    values = column.dropna().value_counts().index
    if size > 0:
        values = values[:size]
    # Index.sort_values() returns a new Index rather than sorting in place;
    # the result must be kept, otherwise the ordering is silently lost.
    return values.sort_values()
def _get_column_values ( self , column , size = - 1 ) :
values = column . dropna ( ) . unique ( )
values . sort ( )
@ -204,7 +211,7 @@ class Binary :
# Let's treat the case of missing values i.e nulls
#
row_count , col_count = column . size , values . size
if col_count > size :
if col_count > size and size > 0 :
# N = np.divide(size,row_count).astype(int)
# N =
i = np . random . choice ( col_count , size )
@ -270,8 +277,8 @@ if __name__ == '__main__' :
- - export will export data to a specified location
"""
df = pd . read_csv ( ' sample.csv ' )
print ( pd. get_dummies ( df . race ) )
print ( ( Binary ( ) ) . apply ( df . race , 2 ) )
print ( df. race . value_counts ( ) )
print ( ( Binary ( ) ) . apply ( df [ ' race ' ] , 3 ) )
# has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
# has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()