You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

11 KiB

None <html lang="en"> <head> </head>
In [66]:
import os

import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

# Location of the service-account key used to authenticate against BigQuery.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence when it is set (and non-empty), so the
# notebook can run on another machine without editing this cell; the original
# path is kept as the fallback.
SERVICE_ACCOUNT_JSON = (
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or '/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json'
)

client = bq.Client.from_service_account_json(SERVICE_ACCOUNT_JSON)
In [33]:
# Column names for the two tables being joined; the same lists are used as the
# `fields` of the person and measurement tables in a later cell.
xo = [
    'person_id',
    'date_of_birth',
    'race',
]
xi = [
    'person_id',
    'value_as_number',
    'value_source_value',
]
In [181]:
def get_tables(client,id,fields=[]):
    """
        List the tables of a dataset that expose at least one of the requested fields.

        @param client   client object; only dataset(), list_tables() and get_table() are called on it
        @param id       identifier of the dataset, handed to client.dataset()
        @param fields   field names to look for; a table is kept when its schema has any of them
        @return         list of {"name": <table_id>, "fields": [<every column name of that table>]}
    """
    matches = []
    dataset_ref = client.dataset(id)
    for entry in list(client.list_tables(dataset_ref)):
        # the listing entry only supplies .reference/.table_id; the schema comes from get_table()
        columns = [column.name for column in client.get_table(entry.reference).schema]
        if set(columns) & set(fields):
            matches.append({"name": entry.table_id, "fields": columns})
    return matches
    
def get_fields(**args):
    """
        Build the projection (list of table-qualified field names) for a join between tables.
        Every table is described by a dictionary {"name":...,"fields":[...]} with
            name     table name, used as the prefix of each qualified field
            fields   list of field names of that table
        @param xo   table whose fields are all kept (the join field included)
        @param xi   table, or list of tables, whose fields are kept except for the join field
        @param join name of the field by which the tables are joined
        @return     list of "<table>.<field>" strings: xo's fields first, then each xi table in order
    """
    def qualify(table_name, field_name):
        return ".".join([table_name, field_name])

    # No de-duplication is performed; names are kept in the order they are given.
    left = args['xo']
    projection = [qualify(left['name'], field) for field in left['fields']]

    # a single table is accepted as well as a list of tables
    others = args['xi'] if isinstance(args['xi'], list) else [args['xi']]
    for table in others:
        kept = [qualify(table['name'], field) for field in table['fields'] if field != args['join']]
        projection.extend(kept)
    return projection
def generate_sql(**args):
    """
        Generate the SQL skeleton that inner-joins one table (xo) with one or more tables (xi).
        The projection is left as the literal placeholder ":fields"; it is up to the caller to
        substitute it.

        @param xo       {"name":...} table used in the FROM clause
        @param xi       {"name":...} table, or list of such tables, each added as an INNER JOIN
        @param join     name of the field shared by the tables; every xi is joined on
                        <xi name>.<join> = <xo name>.<join>
        @param prefix   (optional) qualifier prepended to the table names in the FROM / INNER JOIN
                        clauses; the ON clauses always use the bare table names
        @return         the SQL string; with an empty xi list only "SELECT :fields FROM <xo> " is returned
    """
    xo = args['xo']
    x_ = args['xi']
    if not isinstance(x_,list):
        x_ = [x_]

    def qualified(name):
        # the prefix is optional; without it the bare table name is used
        return ".".join([args['prefix'],name]) if 'prefix' in args else name

    SQL = "SELECT :fields FROM " + qualified(xo['name']) + " "
    INNER_JOINS = []
    for xi in x_ :
        # Each join carries only its own predicate: repeating the predicates of the tables
        # joined earlier in every subsequent ON clause would be redundant.
        ON_SQL = " = ".join([
            ".".join([xi['name'],args['join']]),
            ".".join([xo['name'],args['join']])
        ])
        INNER_JOINS.append("INNER JOIN " + qualified(xi['name']) + " ON " + ON_SQL)
    return SQL + " ".join(INNER_JOINS)
                
#     sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y "
#     fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y']))
    
    
#     sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name'])
#     sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y'])
#     return sql
In [183]:
# Join person with measurement on person_id, qualifying the table names with the 'raw' prefix.
xo = {
    "name": "person",
    "fields": ['person_id', 'date_of_birth', 'race'],
}
xi = [
    {"name": "measurement", "fields": ['person_id', 'value_as_number', 'value_source_value']},
    # second candidate, currently disabled:
    # {"name": "observation", "fields": ["person_id", "value_as_string", "observation_source_value"]},
]
generate_sql(xo=xo, xi=xi, join="person_id", prefix='raw')
Out[183]:
'SELECT :fields FROM raw.person INNER JOIN raw.measurement ON measurement.person_id = person.person_id'
In [55]:
# Each table carries its join field under the "y" key. generate_sql() does not read that key;
# it requires the join field as the `join` keyword, so it is passed explicitly here
# (calling without it raises KeyError: 'join').
xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"}
generate_sql(xo=xo,xi=xi,join=xo["y"])
Out[55]:
'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '
In [59]:
"""
    We are designing a process that will take two tables and generate the join between them (presumably -- the original sentence was left unfinished).
"""
import itertools

# every unordered pair that can be drawn from three items
list(itertools.combinations('abc', 2))
Out[59]:
[('a', 'b'), ('a', 'c'), ('b', 'c')]
In [111]:
# Collect every table of the 'raw' dataset whose schema contains person_id
# (get_tables keeps a table as soon as one of the requested fields is present).
info = get_tables(client, 'raw', fields=['person_id'])

# Earlier experiments, left disabled (`names` is not defined in the cells shown here):
# get_fields(xo=names[0],xi=names[1:4],join='person_id')
# q = ['person_id']
# pairs = list(itertools.combinations(names,len(names)))
# pairs[0]
Out[111]:
[u'condition_occurrence.condition_occurrence_id',
 u'condition_occurrence.person_id',
 u'condition_occurrence.condition_concept_id',
 u'condition_occurrence.condition_start_date',
 u'condition_occurrence.condition_start_datetime',
 u'condition_occurrence.condition_end_date',
 u'condition_occurrence.condition_end_datetime',
 u'condition_occurrence.condition_type_concept_id',
 u'condition_occurrence.stop_reason',
 u'condition_occurrence.provider_id',
 u'condition_occurrence.visit_occurrence_id',
 u'condition_occurrence.condition_source_value',
 u'condition_occurrence.condition_source_concept_id',
 u'death.death_date',
 u'death.death_datetime',
 u'death.death_type_concept_id',
 u'death.cause_concept_id',
 u'death.cause_source_value',
 u'death.cause_source_concept_id',
 u'device_exposure.device_exposure_id',
 u'device_exposure.device_concept_id',
 u'device_exposure.device_exposure_start_date',
 u'device_exposure.device_exposure_start_datetime',
 u'device_exposure.device_exposure_end_date',
 u'device_exposure.device_exposure_end_datetime',
 u'device_exposure.device_type_concept_id',
 u'device_exposure.unique_device_id',
 u'device_exposure.quantity',
 u'device_exposure.provider_id',
 u'device_exposure.visit_occurrence_id',
 u'device_exposure.device_source_value',
 u'device_exposure.device_source_concept_id',
 u'drug_exposure.drug_exposure_id',
 u'drug_exposure.drug_concept_id',
 u'drug_exposure.drug_exposure_start_date',
 u'drug_exposure.drug_exposure_start_datetime',
 u'drug_exposure.drug_exposure_end_date',
 u'drug_exposure.drug_exposure_end_datetime',
 u'drug_exposure.drug_type_concept_id',
 u'drug_exposure.stop_reason',
 u'drug_exposure.refills',
 u'drug_exposure.quantity',
 u'drug_exposure.days_supply',
 u'drug_exposure.sig',
 u'drug_exposure.route_concept_id',
 u'drug_exposure.effective_drug_dose',
 u'drug_exposure.dose_unit_concept_id',
 u'drug_exposure.lot_number',
 u'drug_exposure.provider_id',
 u'drug_exposure.visit_occurrence_id',
 u'drug_exposure.drug_source_value',
 u'drug_exposure.drug_source_concept_id',
 u'drug_exposure.route_source_value',
 u'drug_exposure.dose_unit_source_value']
In [90]:
# sanity check of the set intersection used to match requested fields against a schema
list({'a', 'b'}.intersection({'a'}))
Out[90]:
['a']
In [120]:
# Scratch assignment: the functions above bind their own local x_, and no visible cell
# reads this global -- NOTE(review): looks like a leftover, confirm before removing.
x_ = 1
In [ ]:

</html>