@@ -151,6 +151,7 @@ class Components :
         if df.shape[0] and df.shape[0] :
             #
             # We have a full blown matrix to be processed
+            print ('-- Training --')
             data.maker.train(**_args)
         else:
             print ("... skipping training !!")
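
Note on the guard in this hunk: `df.shape[0] and df.shape[0]` tests the row count twice; `df.shape[1]` (the column count) looks intended for the second operand. A minimal sketch of that reading, with a hypothetical helper name:

    import pandas as pd

    def should_train(df: pd.DataFrame) -> bool:
        # Train only when the frame has at least one row and one column.
        # Assumption: the second df.shape[0] in the hunk was meant to be df.shape[1].
        return bool(df.shape[0] and df.shape[1])

    print(should_train(pd.DataFrame({'a': [1]})))  # True
    print(should_train(pd.DataFrame()))            # False
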
@@ -259,16 +260,23 @@ class Components :
                     _df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10])
                     #_df[name] = _df[name].dt.date
                     # _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
+                    else:
+                        pass
+                        _df[name] = pd.to_datetime(_df[name])
                 else:
-                    value = 0
                     if _item['type'] == 'INTEGER' :
                         _type = np.int64
                     elif _item['type'] in ['FLOAT','NUMERIC']:
                         _type = np.float64
                     else:
                         _value = ''
-                    _df[name] = _df[name].fillna(_value).astype(_type)
+                    _df[name] = _df[name].fillna(_value) #.astype(_type)
                 columns.append(name)
-            writer.write(_df,schema=_schema,table=args['from'])
+            print ()
+            print (_df)
+            writer.write(_df.astype(object),schema=_schema,table=args['from'])
         else:
             writer.write(_df,table=args['from'])
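
Two things worth flagging in this hunk: the removed `value = 0` set `value` while the branch reads `_value`, so `_value` can be unbound for INTEGER/FLOAT columns; and once `fillna('')` mixes strings into a numeric column, a follow-up `.astype(np.int64)` raises, which is presumably why the cast is commented out and the frame is written as `object`. A small standalone illustration:

    import numpy as np
    import pandas as pd

    _df = pd.DataFrame({'ts': pd.to_datetime(['2019-10-04', None]),
                        'n': [1.0, np.nan]})

    # Date handling as in the hunk: NaT becomes '', valid stamps keep YYYY-MM-DD.
    _df['ts'] = _df['ts'].apply(lambda v: '' if str(v) == 'NaT' else str(v)[:10])

    # fillna('') yields a mixed object column; .astype(np.int64) would raise here.
    _df['n'] = _df['n'].fillna('')
    print(_df.astype(object))
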
@@ -350,7 +358,7 @@ class Components :
         for _item in schema :
             dtype = str
             name = _item['name']
-            novalue = -1
+            novalue = 0
             if _item['type'] in ['INTEGER','NUMERIC']:
                 dtype = np.int64
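
For context, the loop in this hunk reduces to a schema-to-dtype mapping; a hypothetical condensed form (helper name and string fallback invented) showing the no-value default that moves from -1 to 0:

    import numpy as np

    def dtype_for(item):
        # Map a BigQuery-style schema entry to a dtype and a fill value for NULLs.
        if item['type'] in ['INTEGER', 'NUMERIC']:
            return np.int64, 0   # this hunk changes the no-value default from -1 to 0
        return str, ''

    print(dtype_for({'name': 'person_id', 'type': 'INTEGER'}))   # (<class 'numpy.int64'>, 0)
    print(dtype_for({'name': 'race', 'type': 'STRING'}))         # (<class 'str'>, '')
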
@@ -550,7 +558,7 @@ if __name__ == '__main__' :
     index = f[0] if f else 0
     #
-    print ("..::: ",PIPELINE[index]['context'])
+    print ("..::: ",PIPELINE[index]['context'],':::..')
     args = (PIPELINE[index])
     for key in _config :
         if key == 'pipeline' or key in args:
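
The merge loop at the end of this hunk copies global `_config` entries into the selected pipeline stage; the loop body isn't shown, but the condition implies stage-local values win over config defaults. A sketch under that assumption, with invented sample values:

    _config = {'pipeline': [], 'dataset': 'combined20191004v2_deid', 'logs': 'logs'}
    args = {'context': 'measurement', 'logs': 'custom-logs'}  # one PIPELINE entry

    for key in _config:
        if key == 'pipeline' or key in args:
            continue  # assumption: keys already set on the stage are kept
        args[key] = _config[key]

    print(args)  # 'logs' stays 'custom-logs'; 'dataset' is filled from _config
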
@@ -567,6 +575,7 @@ if __name__ == '__main__' :
     args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
     if 'dataset' not in args :
         args['dataset'] = 'combined20191004v2_deid'
+    args['logs'] = args['logs'] if 'logs' in args else 'logs'
     PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
     #
     # @TODO:
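
The defaults in this hunk, including the added `logs` fallback, are plain dict fallbacks; a runnable sketch using the same names:

    args = {'part_size': '4'}

    args['batch_size'] = 2000  # hard-coded; the trailing comment preserves the old override
    if 'dataset' not in args:
        args['dataset'] = 'combined20191004v2_deid'
    args['logs'] = args['logs'] if 'logs' in args else 'logs'
    PART_SIZE = int(args['part_size']) if 'part_size' in args else 8

    print(args['logs'], PART_SIZE)  # logs 4
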
@@ -599,6 +608,7 @@ if __name__ == '__main__' :
             jobs.append(job)
             pass
         else:
             generator = Components()
             generator.generate(args)
     elif 'shuffle' in SYS_ARGS :