|
|
@ -139,19 +139,23 @@ class risk :
|
|
|
|
fields = list(set(table['fields']) - set([key]))
|
|
|
|
fields = list(set(table['fields']) - set([key]))
|
|
|
|
#-- We need to select n-fields max 64
|
|
|
|
#-- We need to select n-fields max 64
|
|
|
|
k = len(fields)
|
|
|
|
k = len(fields)
|
|
|
|
n = np.random.randint(2,24) #-- how many random fields are we processing
|
|
|
|
n = np.random.randint(2,64) #-- how many random fields are we processing
|
|
|
|
ii = np.random.choice(k,n,replace=False)
|
|
|
|
ii = np.random.choice(k,n,replace=False)
|
|
|
|
fields = list(np.array(fields)[ii])
|
|
|
|
stream = np.zeros(len(fields) + 1)
|
|
|
|
|
|
|
|
stream[ii] = 1
|
|
|
|
|
|
|
|
stream = pd.DataFrame(stream.tolist()).T
|
|
|
|
|
|
|
|
stream.columns = args['table']['fields']
|
|
|
|
|
|
|
|
fields = list(np.array(fields)[ii])
|
|
|
|
|
|
|
|
|
|
|
|
sql = """
|
|
|
|
sql = """
|
|
|
|
SELECT COUNT(g_size) as group_count, SUM(g_size) as patient_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor
|
|
|
|
SELECT COUNT(g_size) as group_count, COUNT( DISTINCT :key) as patient_count,SUM(g_size) as rec_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor, :n as field_count
|
|
|
|
FROM (
|
|
|
|
FROM (
|
|
|
|
SELECT COUNT(*) as g_size,:key,:fields
|
|
|
|
SELECT COUNT(*) as g_size,:key,:fields
|
|
|
|
FROM :full_name
|
|
|
|
FROM :full_name
|
|
|
|
GROUP BY :key,:fields
|
|
|
|
GROUP BY :key,:fields
|
|
|
|
)
|
|
|
|
)
|
|
|
|
""".replace(":fields", ",".join(fields)).replace(":full_name",table['full_name']).replace(":key",key).replace(":n",str(n))
|
|
|
|
""".replace(":fields", ",".join(fields)).replace(":full_name",table['full_name']).replace(":key",key).replace(":n",str(n))
|
|
|
|
return sql
|
|
|
|
return {"sql":sql,"stream":stream}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -195,14 +199,19 @@ if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute'] :
|
|
|
|
#
|
|
|
|
#
|
|
|
|
#
|
|
|
|
#
|
|
|
|
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
|
|
|
|
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
|
|
|
|
|
|
|
|
limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
|
|
|
|
if tables :
|
|
|
|
if tables :
|
|
|
|
risk = risk()
|
|
|
|
risk = risk()
|
|
|
|
df = pd.DataFrame()
|
|
|
|
df = pd.DataFrame()
|
|
|
|
for i in range(0,10) :
|
|
|
|
dfs = pd.DataFrame()
|
|
|
|
sql = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
|
|
|
|
for i in range(0,limit) :
|
|
|
|
|
|
|
|
r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
|
|
|
|
|
|
|
|
sql = r['sql']
|
|
|
|
|
|
|
|
dfs = dfs.append(r['stream'])
|
|
|
|
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard'))
|
|
|
|
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard'))
|
|
|
|
df.to_csv(SYS_ARGS['table']+'.csv')
|
|
|
|
df.to_csv(SYS_ARGS['table']+'.csv')
|
|
|
|
print [i,' ** ',df.shape[0]]
|
|
|
|
dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
|
|
|
|
|
|
|
|
print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
|
|
|
|
time.sleep(2)
|
|
|
|
time.sleep(2)
|
|
|
|
|
|
|
|
|
|
|
|
pass
|
|
|
|
pass
|
|
|
@ -223,4 +232,4 @@ else:
|
|
|
|
# p = r.compute()
|
|
|
|
# p = r.compute()
|
|
|
|
# print p
|
|
|
|
# print p
|
|
|
|
# p.to_csv("risk.csv")
|
|
|
|
# p.to_csv("risk.csv")
|
|
|
|
# r.write('foo.sql')
|
|
|
|
# r.write('foo.sql')
|
|
|
|