You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

11 KiB

None <html lang="en"> <head> </head>
In [1]:
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
# pd.read_gbq(query="select * from raw.observation limit 10",private_key='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
jobs = client.list_jobs()
for job in jobs :
#     print dir(job)
    print job.user_email,job.job_id,job.started, job.state
    break
dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com df0ac049-d5b6-416f-ab3c-6321eda919d6 2018-09-25 08:18:34.829000+00:00 DONE
In [33]:
xo = ['person_id','date_of_birth','race']
xi = ['person_id','value_as_number','value_source_value']
In [10]:
def get_tables(client,id,fields=[]):
    """
        getting table lists from google
    """
    r = []
    ref = client.dataset(id)
    tables = list(client.list_tables(ref))
    for table in tables :
        ref = table.reference
        schema = client.get_table(ref).schema
        names = [f.name for f in schema]
        x = list(set(names) & set(fields))
        if x  :
            r.append({"name":table.table_id,"fields":names})
    return r
    
def get_fields(**args):
    """
        This function will generate a random set of fields from two tables. Tables are structured as follows 
        {name,fields:[],"y":}, with 
            name     table name (needed to generate sql query)
            fields   list of field names, used in the projection
            y        name of the field to be joined.
        @param xo candidate table in the join
        @param xi candidate table in the join
        @param join field by which the tables can be joined.
    """
    # The set operation will remove redundancies in the field names (not sure it's a good idea)
#     xo = args['xo']['fields']
#     xi = args['xi']['fields']
#     zi = args['xi']['name']
#     return list(set([ ".".join([args['xo']['name'],name]) for name in xo]) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) )
    xo = args['xo']
    fields = [".".join([args['xo']['name'],name]) for name in args['xo']['fields']]
    if not isinstance(args['xi'],list) :
        x_ = [args['xi']]
    else:
        x_ = args['xi']
    for xi in x_ :
        fields += (['.'.join([xi['name'], name]) for name in xi['fields'] if name != args['join']])
    return fields
def generate_sql(**args):
    """
        This function will generate the SQL query for the resulting join
    """
    
    xo = args['xo']
    x_ = args['xi']
    xo_name = ".".join([args['prefix'],xo['name'] ]) if 'prefix' in args else xo['name']
    SQL = "SELECT :fields FROM :xo.name ".replace(":xo.name",xo_name)
    if not isinstance(x_,list):
        x_ = [x_]
    f = []#[".".join([args['xo']['name'],args['join']] )]    
    INNER_JOINS = []
    for xi in x_ :
        xi_name = ".".join([args['prefix'],xi['name'] ]) if 'prefix' in args else xi['name']
        JOIN_SQL = "INNER JOIN :xi.name ON ".replace(':xi.name',xi_name)
        value  = ".".join([xi['name'],args['join']])
        f.append(value) 
        
        ON_SQL = ""
        tmp = []
        for term in f :
            ON_SQL = ":xi.name.:ofield = :xo.name.:ofield".replace(":xo.name",xo['name'])
            ON_SQL = ON_SQL.replace(":xi.name.:ofield",term).replace(":ofield",args['join'])
            tmp.append(ON_SQL)
        INNER_JOINS += [JOIN_SQL + " AND ".join(tmp)]
    return SQL + " ".join(INNER_JOINS)
def get_final_sql(**args):
    xo = args['xo']
    xi = args['xi']
    join=args['join']
    prefix = args['prefix'] if 'prefix' in args else ''
    fields = get_fields (xo=xo,xi=xi,join=join)
    k = len(fields)
    n = np.random.randint(2,k) #-- number of fields to select
    i = np.random.randint(0,k,size=n)
    fields = [name for name in fields if fields.index(name) in i]
    base_sql = generate_sql(xo=xo,xi=xi,prefix)
    SQL = """
        SELECT AVERAGE(count),size,n as selected_features,k as total_features
        FROM(
            SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields
            FROM (:sql)
        GROUP BY :fields
        ) 
        order by 1
        
    """.replace(":sql",base_sql)
#     sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y "
#     fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y']))
    
    
#     sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name'])
#     sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y'])
#     return sql
In [33]:
xo = {"name":"person","fields":['person_id','date_of_birth','race','value_as_number']}
xi = [{"name":"measurement","fields":['person_id','value_as_number','value_source_value']}] #,{"name":"observation","fields":["person_id","value_as_string","observation_source_value"]}]
# generate_sql(xo=xo,xi=xi,join="person_id",prefix='raw')
fields = get_fields(xo=xo,xi=xi,join='person_id')
ofields = list(fields)
k = len(fields)
n = np.random.randint(2,k) #-- number of fields to select
i = np.random.randint(0,k,size=n)
fields = [name for name in fields if fields.index(name) in i]
In [34]:
fields
Out[34]:
['person.race', 'person.value_as_number', 'measurement.value_source_value']
In [55]:
xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"}
generate_sql(xo=xo,xi=xi)
Out[55]:
'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '
In [59]:
"""
    We are designing a process that will take two tables that will generate 
"""
import itertools
list(itertools.combinations(['a','b','c'],2))
Out[59]:
[('a', 'b'), ('a', 'c'), ('b', 'c')]
In [6]:
#
# find every table with person id at the very least or a subset of fields
#
np.random.randint(0,4,size=4)
Out[6]:
array([1, 3, 0, 0])
In [90]:
list(set(['a','b']) & set(['a']))
Out[90]:
['a']
In [120]:
x_ = 1
In [10]:
x_ = pd.DataFrame({"group":[1,1,1,1,1], "size":[2,1,1,1,1]})
In [12]:
x_.groupby(['group']).mean()
Out[12]:
size
group
1 1.2
In [ ]:

</html>