Fixed data-type casting misbehavior in pandas when columns contain missing values

dev
Steve L. Nyemba 5 years ago
parent 50da909867
commit 821cec8dd7

@ -647,13 +647,8 @@ class Predict(GNet):
info['ratio'] = __ratio info['ratio'] = __ratio
info['partition'] = self.PARTITION info['partition'] = self.PARTITION
self.logger.write({"module":"gan-generate","action":"generate","input":info}) self.logger.write({"module":"gan-generate","action":"generate","input":info})
df.columns = self.values # df.columns = self.values
if len(found) or df.columns.size == len(self.values): if len(found) or df.columns.size <= len(self.values):
# print (len(found),NTH_VALID_CANDIDATE)
# x = df * self.values
#
# let's get the missing rows (if any) ...
#
ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1) ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
# print ([' **** ',ii.sum()]) # print ([' **** ',ii.sum()])
@ -669,6 +664,8 @@ class Predict(GNet):
# Log the findings here in terms of ratio, missing, candidate count # Log the findings here in terms of ratio, missing, candidate count
# print ([np.max(ratio),len(missing),len(found),i]) # print ([np.max(ratio),len(missing),len(found),i])
i = np.where(ii == 0)[0] i = np.where(ii == 0)[0]
df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1)) df = pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
df.columns = columns df.columns = columns
df = df[columns[0]].append(pd.Series(missing)) df = df[columns[0]].append(pd.Series(missing))

@ -190,7 +190,7 @@ def generate(**args):
# #
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size']) BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value'] NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
_df = df.copy() _df = df.copy()
for col in column : for col in column :
args['context'] = col args['context'] = col
@ -237,6 +237,11 @@ def generate(**args):
_df[col] = r[col] _df[col] = r[col]
# #
# Cast the column back to its original dtype (it makes the data more usable)
#
otype = df[col].dtype
_df[col] = _df[col].astype(otype)
#
# @TODO: log basic stats about the synthetic attribute # @TODO: log basic stats about the synthetic attribute
# #
# print (r)s # print (r)s

@ -195,8 +195,7 @@ class Components :
if name.endswith('_id') : if name.endswith('_id') :
if df[name].isnull().sum() > 0 : if df[name].isnull().sum() > 0 :
df[name].fillna(0,inplace=True) df[name].fillna(np.nan_to_num(np.nan),inplace=True)
else:
df[name] = df[name].astype(int) df[name] = df[name].astype(int)
@ -253,9 +252,11 @@ class Components :
print (_args['data'].head()) print (_args['data'].head())
else: else:
Components.lock.acquire() Components.lock.acquire()
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000) data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append' INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
print (_args['data'].dtypes)
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000) _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
Components.lock.release() Components.lock.release()
_id = 'dataset' _id = 'dataset'

@ -4,7 +4,7 @@ import sys
def read(fname): def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read() return open(os.path.join(os.path.dirname(__file__), fname)).read()
args = {"name":"data-maker","version":"1.2.8","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT", args = {"name":"data-maker","version":"1.2.9","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]} "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo'] args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git' args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'

Loading…
Cancel
Save