You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
data-maker/pipeline.py

676 lines
22 KiB
Python

#!/usr/bin/env python3
5 years ago
@staticmethod
df = pd.read_gbq(SQL,credentials=credentials,dialect='standard')
else :
4 years ago
def shuffle(self,_args):
for _item in _schema :
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
_df[_item['name']] = _df[_item['name']].astype(str)
writer.write(_df,schema=_schema,table=args['from'])
else:
writer.write(_df,table=args['from'])
_cast = {}
4 years ago
args['data'] = args['data'][ list(set(df.columns)- set(_cols))]
# we need to format the fields here to make sure we have something cohesive
if set(df.columns) & set(_df.columns) :
4 years ago
# info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
# if partition :
# info ['partition'] = int(partition)
# logger.write({"module":"generate","action":"write","input":info} )
args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
job.name = 'Trainer # ' + str(index)
time.sleep(2)