From 377e84daea23ad126ea787102449b4ffa09b1fd3 Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 17 May 2022 18:04:05 -0500 Subject: [PATCH] bug fix: uploading data --- data/maker/__init__.py | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/data/maker/__init__.py b/data/maker/__init__.py index c8dc02a..d05509d 100644 --- a/data/maker/__init__.py +++ b/data/maker/__init__.py @@ -96,14 +96,17 @@ class Learner(Process): # # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases # - The code below tries to address the issue (Perhaps better suited for the reading components) + _log = {} for name in columns : _index = np.random.choice(np.arange(self._df[name].size),5,False) no_value = [type(value) in [int,float,np.int64,np.int32,np.float32,np.float64] for value in self._df[name].values[_index]] no_value = 0 if np.sum(no_value) > 0 else '' self._df[name] = self._df[name].fillna(no_value) - - + + _log[name] = self._df[name].dtypes.name + _log = {'action':'structure','input':_log} + self.log(**_log) # # convert the data to binary here ... _schema = self.get_schema() @@ -293,46 +296,52 @@ class Generator (Learner): name = _item['name'] if _item['type'].upper() in ['DATE','DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d' + FORMAT = '%m-%d-%Y' - try: - # - #-- Sometimes data isn't all it's meant to be - SIZE = -1 - if 'format' in self.info and name in self.info['format'] : - FORMAT = self.info['format'][name] - SIZE = 10 - elif _item['type'] in ['DATETIME','TIMESTAMP'] : - FORMAT = '%Y-%m-%d %H:%M:%S' - SIZE = 19 + # try: + # # + # #-- Sometimes data isn't all it's meant to be + # SIZE = -1 + # if 'format' in self.info and name in self.info['format'] : + # FORMAT = self.info['format'][name] + # SIZE = 10 + # elif _item['type'] in ['DATETIME','TIMESTAMP'] : + # FORMAT = '%m-%d-%Y %H:%M:%S' + # SIZE = 19 - if SIZE > 0 : + # if SIZE > 0 : + + # values = pd.to_datetime(_df[name], format=FORMAT).astype(str) + # _df[name] = [_date[:SIZE].strip() for _date in values] - values = pd.to_datetime(_df[name], format=FORMAT).astype(str) - _df[name] = [_date[:SIZE] for _date in values] + # # _df[name] = _df[name].astype(str) + # r[name] = FORMAT + # # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') + # if _item['type'] in ['DATETIME','TIMESTAMP']: + # pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - r[name] = FORMAT - # _df[name] = pd.to_datetime(_df[name], format=FORMAT) #.astype('datetime64[ns]') - if _item['type'] in ['DATETIME','TIMESTAMP']: - pass #;_df[name] = _df[name].fillna('').astype('datetime64[ns]') - - except Exception as e: - pass - finally: - pass + # except Exception as e: + # pass + # finally: + # pass else: # # Because types are inferred on the basis of the sample being processed they can sometimes be wrong # To help disambiguate we add the schema information _type = None + if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower(): _type = np.int + elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower(): _type = np.float if _type : - _df[name] = _df[name].fillna(0).replace('',0).astype(_type) + + _df[name] = _df[name].fillna(0).replace('',0).replace('NA',0).replace('nan',0).astype(_type) + # else: + # _df[name] = _df[name].astype(str) # _df = _df.replace('NaT','').replace('NA','') if r : @@ -373,10 +382,19 @@ class Generator (Learner): _schema = self.get_schema() _schema = [{'name':_item.name,'type':_item.field_type} for _item in _schema] _df = self.format(_df,_schema) + _log = [{"name":_schema[i]['name'],"dataframe":_df[_df.columns[i]].dtypes.name,"schema":_schema[i]['type']} for i in np.arange(len(_schema)) ] + self.log(**{"action":"consolidate","input":_log}) + + # w = transport.factory.instance(doc='observation',provider='mongodb',context='write',db='IOV01_LOGS',auth_file='/home/steve/dev/transport/mongo.json') + # w.write(_df) + # print (_df[cols]) writer = transport.factory.instance(**_store) writer.write(_df,schema=_schema) - # _df.to_csv('foo.csv') + + + + self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}}) class Shuffle(Generator):