diff --git a/data/maker/__init__.py b/data/maker/__init__.py
index bf388a6..9d3bdb5 100644
--- a/data/maker/__init__.py
+++ b/data/maker/__init__.py
@@ -12,14 +12,15 @@
 import pandas as pd
 import numpy as np
 import data.gan as gan
 import transport
-from data.bridge import Binary
+# from data.bridge import Binary
 import threading as thread
 from data.maker import prepare
 import copy
 import os
 import json
 from multiprocessing import Process, RLock
-
+import calendar
+from datetime import datetime, timedelta
 class ContinuousToDiscrete :
     ROUND_UP = 2
@@ -229,7 +230,17 @@ class Learner(Process):
         # self.logpath= _args['logpath'] if 'logpath' in _args else 'logs'
         # sel.max_epoc
     def get_schema(self):
-        return [{'name':self._df.dtypes.index.tolist()[i],'type':self._df.dtypes.astype(str).tolist()[i]}for i in range(self._df.dtypes.shape[0])]
+        """
+        Describe the columns of the data being processed.
+        :return list of {'name':...,'type':...} built from the data-frame dtypes, or
+                whatever the reader's meta() returns when the source provider is bigquery
+        """
+        if self.store['source']['provider'] != 'bigquery' :
+            #-- single pass over the dtypes instead of rebuilding the name/type lists for every column
+            return [{'name':_name,'type':str(_dtype)} for _name,_dtype in self._df.dtypes.items()]
+        else:
+            reader = transport.factory.instance(**self.store['source'])
+            return reader.meta(table=self.info['from'])
     def initalize(self):
         reader = transport.factory.instance(**self.store['source'])
         _read_args= self.info
@@ -319,21 +330,65 @@ class Generator (Learner):
         _iomatrix = gHandler.apply()
         _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
         self.post(_candidates)
-    def appriximate(self,_df):
+    def approximate(self,_df):
+        """
+        Perturb the columns listed in the 'approximate' configuration: each column is split in 2 batches
+        and a random dirichlet sample is added to or subtracted from (picked at random per batch) the non-empty values
+        :param _df  data-frame holding the columns to perturb
+        :return the data-frame, with the perturbed columns
+        """
         _columns = self.info['approximate']
-        _schema = {}
-        for _info in self.get_schema() :
-            _schema[_info['name']] = _info['type']
+        # _schema = {}
+        # for _info in self.get_schema() :
+        #     _schema[_info['name']] = _info['type']
         for name in _columns :
-            batches = np.array_split(_df[name].values,10)
+            batches = np.array_split(_df[name].fillna(np.nan).values,2)
+            _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
             x = []
             for values in batches :
-                _values = np.random.dirichlet(values)
-                x += list(values + _values )if np.random.randint(0,2) else list(values - _values)
-            _df[name] = np.int64(x) if 'int' in _schema[name] else np.float64(x)
+
+                index = np.where(values != '')
+                _values = np.random.dirichlet(values[index].astype(_type))
+                values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
+                values[index] = values[index].astype(_type)
+                x += values.tolist()
+            if x :
+                _df[name] = x #np.array(x,dtype=np.int64) if 'int' in _type else np.arry(x,dtype=np.float64)
 
         return _df
 
+    def make_date(self,**_args) :
+        """
+        Generate a random calendar date (formatted as text) within a given year.
+        :param year     year of the synthetic date; '' or null values (None/NaN) yield None
+        :param offset   optional list of day-deltas; when provided a list of dates is returned, the
+                        random date followed by one date per delta (deltas are applied cumulatively)
+        :param format   strftime format of the output, default '%Y-%m-%d'
+        :return None, a formatted date or a list of formatted dates
+        """
+        _year = _args['year']
+        #-- pd.isnull catches None and every NaN, not just the np.nan singleton (int(NaN) would raise)
+        if pd.isnull(_year) or _year == '' :
+            return None
+        year = int(_year)
+        offset = _args['offset'] if 'offset' in _args else 0
+        month = np.random.randint(1,13)
+        #-- monthrange implements the full gregorian leap year rule (century years included)
+        _end = calendar.monthrange(year,month)[1]
+        #-- the upper bound of randint is exclusive, +1 makes the last day of the month reachable
+        day = np.random.randint(1,_end + 1)
+
+        #-- synthetic date
+        _date = datetime(year=year,month=month,day=day)
+        FORMAT = _args['format'] if 'format' in _args else '%Y-%m-%d'
+        if offset :
+            r = [_date.strftime(FORMAT)]
+            for _delta in offset :
+                _date = _date + timedelta(_delta)
+                r.append(_date.strftime(FORMAT))
+            return r
+        else:
+            return _date.strftime(FORMAT)
     def format(self,_df):
         pass
     def post(self,_candidates):
@@ -345,10 +400,20 @@ class Generator (Learner):
         for _iodf in _candidates :
             _df = self._df.copy()
             _df[self.columns] = _iodf[self.columns]
-            if 'approximate' in self.info :
-
-                _df = self.appriximate(_df)
-            writer.write(_df,schema=self.get_schema())
+            if 'approximate' in self.info :
+                _df = self.approximate(_df)
+            if 'make_date' in self.info :
+                for name in self.info['make_date'] :
+                    # iname = self.info['make_date']['init_field']
+                    iname = self.info['make_date'][name]
+
+                    years = _df[iname]
+                    _dates = [self.make_date(year=year) for year in years]
+                    if _dates :
+                        _df[name] = _dates
+
+            # NOTE(review): 'birth_datetime' is hard-coded; the selection raises KeyError when the frame has no such column -- confirm intent
+            writer.write(_df[['birth_datetime']+self.columns],schema=self.get_schema())
         pass
 class factory :
     _infocache = {}