You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
11 KiB
11 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery as bq
# Location of the service-account key. The standard GOOGLE_APPLICATION_CREDENTIALS
# variable wins when it is set; the original local path is kept as the fallback so
# the notebook behaves as before on the machine it was written on.
KEY_PATH = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
client = bq.Client.from_service_account_json(KEY_PATH)

# Connection smoke test: show only the first job returned by the API.
jobs = client.list_jobs()
for job in jobs :
    # Single formatted string so the line prints the same under Python 2 and 3.
    print("%s %s %s %s" % (job.user_email, job.job_id, job.started, job.state))
    break
In [33]:
# Two scratch lists of column names; 'person_id' is the only name they share.
# NOTE(review): xo / xi are rebound to table-descriptor dicts further down in this file.
xo = ['person_id','date_of_birth','race']
xi = ['person_id','value_as_number','value_source_value']
In [10]:
def get_tables(client,id,fields=None):
    """
    List the tables of a dataset that expose at least one of the requested fields.

    @param client   BigQuery client; dataset(), list_tables() and get_table() are used
    @param id       identifier of the dataset to inspect
    @param fields   iterable of column names of interest; a table is kept when its
                    schema contains at least one of them (no fields => nothing is kept)
    @return list of {"name": <table id>, "fields": [<every column name of that table>]}
    """
    # Materialised once, up front: rebuilding the set for every table is wasted work
    # and silently empties one-shot iterables (generators) after the first table.
    wanted = set(fields) if fields is not None else set()
    dataset_ref = client.dataset(id)
    matches = []
    for table in client.list_tables(dataset_ref):
        schema = client.get_table(table.reference).schema
        names = [column.name for column in schema]
        if wanted.intersection(names):
            matches.append({"name": table.table_id, "fields": names})
    return matches
def get_fields(**args):
    """
    Return every column of the joined tables as a table-qualified name ("table.column").

    Table descriptors are dictionaries of the form {"name": ..., "fields": [...]}:
        name    table name, used as the qualifier
        fields  column names of that table

    The columns of xo come first, followed by the columns of each xi table in order.
    The join column is skipped for the xi tables, so it is listed for xo only.

    @param xo   descriptor of the first table of the join
    @param xi   descriptor, or list of descriptors, of the tables joined onto xo
    @param join name of the column the tables are joined on
    @return list of qualified column names
    """
    left = args['xo']
    qualified = [".".join((left['name'], column)) for column in left['fields']]

    right = args['xi']
    others = right if isinstance(right, list) else [right]
    for table in others:
        for column in table['fields']:
            if column != args['join']:
                qualified.append(".".join((table['name'], column)))
    return qualified
def generate_sql(**args):
    """
    Build the "SELECT ... FROM xo INNER JOIN xi ON ..." skeleton of the join.

    The projection is left as the literal placeholder ":fields" so that the caller
    can substitute the column list afterwards.

    @param xo       table descriptor {"name": ..., "fields": [...]} used in the FROM clause
    @param xi       table descriptor, or list of descriptors, INNER JOINed onto xo
    @param join     (optional) column shared by the tables; when omitted, each
                    descriptor's own "y" entry is used as its join column
    @param prefix   (optional) dataset name prepended to the table names in the
                    FROM / JOIN clauses; an empty prefix is treated as absent
    @return the SQL string
    @raise KeyError when a join is needed and neither `join` nor a "y" entry exists
    """
    xo = args['xo']
    tables = args['xi']
    if not isinstance(tables, list):
        tables = [tables]
    prefix = args.get('prefix')

    def qualified(table):
        # An empty prefix would otherwise render as ".table".
        return ".".join([prefix, table['name']]) if prefix else table['name']

    def join_column(table):
        if 'join' in args:
            return args['join']
        if 'y' in table:
            return table['y']
        raise KeyError("no 'join' argument and no 'y' entry for table %s" % table['name'])

    sql = "SELECT :fields FROM " + qualified(xo) + " "
    # Conditions accumulate across tables: the ON clause of each JOIN repeats the
    # conditions of the tables joined before it (behaviour kept from the original).
    conditions = []
    joins = []
    for xi in tables:
        conditions.append("%s.%s = %s.%s" % (
            xi['name'], join_column(xi), xo['name'], join_column(xo)))
        joins.append("INNER JOIN " + qualified(xi) + " ON " + " AND ".join(conditions))
    return sql + " ".join(joins)
def get_final_sql(**args):
    """
    Build the aggregate query computed over a random subset of the joined fields.

    A random number of the qualified fields returned by get_fields() is drawn, the
    join skeleton returned by generate_sql() is embedded as a sub-query, and the
    :fields / :join / :n / :k placeholders of the template are filled in.

    @param xo       table descriptor {"name": ..., "fields": [...]}
    @param xi       table descriptor, or list of descriptors, joined onto xo
    @param join     column the tables are joined on
    @param prefix   (optional) dataset name prepended to the table names
    @return the SQL string
    """
    xo = args['xo']
    xi = args['xi']
    join = args['join']

    candidates = get_fields(xo=xo, xi=xi, join=join)
    k = len(candidates)
    # randint's upper bound is exclusive and must exceed the lower bound, so field
    # lists of two or fewer entries are used whole instead of raising ValueError.
    n = np.random.randint(2, k) if k > 2 else k     #-- number of fields to select
    # Draw n *distinct* positions (sampling with replacement could select fewer
    # than n fields) and keep the fields in their original order.
    positions = sorted(np.random.permutation(k)[:n])
    fields = [candidates[p] for p in positions]

    # Keyword arguments only; the prefix is forwarded only when one was supplied,
    # because an empty prefix would be rendered as ".table" by the join builder.
    join_args = {'xo': xo, 'xi': xi, 'join': join}
    if args.get('prefix'):
        join_args['prefix'] = args['prefix']
    base_sql = generate_sql(**join_args)

    # NOTE(review): the template text is kept as written. AVERAGE and the bare
    # `size` column do not look like valid BigQuery SQL (AVG / pop?), and the outer
    # query reuses the table-qualified names of the sub-query -- confirm the intent
    # before running the result.
    SQL = """
        SELECT AVERAGE(count),size,n as selected_features,k as total_features
        FROM(
            SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields
            FROM (:sql)
            GROUP BY :fields
        )
        order by 1
    """
    # ":sql" goes first because the embedded skeleton carries its own ":fields".
    SQL = SQL.replace(":sql", base_sql)
    SQL = SQL.replace(":fields", ",".join(fields)).replace(":join", join)
    SQL = SQL.replace(":n", str(n)).replace(":k", str(k))
    return SQL
In [33]:
xo = {"name":"person","fields":['person_id','date_of_birth','race','value_as_number']}
xi = [{"name":"measurement","fields":['person_id','value_as_number','value_source_value']}] #,{"name":"observation","fields":["person_id","value_as_string","observation_source_value"]}]

# Every table-qualified column of the join; `ofields` keeps the full list.
ofields = get_fields(xo=xo,xi=xi,join='person_id')
k = len(ofields)
n = np.random.randint(2,k)  #-- number of fields to select (upper bound exclusive)
# n *distinct* positions, in ascending order: drawing with replacement could
# repeat a position and therefore select fewer than n fields.
i = np.sort(np.random.permutation(k)[:n])
fields = [ofields[p] for p in i]
In [34]:
# Display the current value of `fields`.
fields
Out[34]:
In [55]:
xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"}
# Pass the join column explicitly; it is the one stored under "y" in the descriptors.
generate_sql(xo=xo,xi=xi,join=xo['y'])
Out[55]:
In [59]:
"""
We are designing a process that will take two tables and generate the SQL that joins them (TODO: the original note was left unfinished -- confirm the intended output).
"""
import itertools
# Scratch check: every 2-element combination of the three items, as a list of tuples.
list(itertools.combinations(['a','b','c'],2))
Out[59]:
In [6]:
#
# Goal: find every table that has person_id at the very least, or some subset of the fields
#
# Scratch check: four random integers drawn from [0, 4); the same value can appear more than once.
np.random.randint(0,4,size=4)
Out[6]:
In [90]:
# Scratch check: the elements common to both lists, via set intersection.
list(set(['a','b']) & set(['a']))
Out[90]:
In [120]:
# Scratch assignment; x_ is rebound to a DataFrame in the next cell of this file.
x_ = 1
In [10]:
# Toy frame: one group (1) whose five rows carry the sizes 2, 1, 1, 1, 1.
x_ = pd.DataFrame({"group": [1] * 5, "size": [2] + [1] * 4})
In [12]:
# Per-group mean of the non-grouping columns, indexed by 'group'.
x_.groupby(['group']).mean()
Out[12]:
In [ ]: