privacykit

7.3 KiB

Raw Blame History

None <html lang="en"> <head> </head>

In [66]:

import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')

In [33]:

xo = ['person_id','date_of_birth','race']
xi = ['person_id','value_as_number','value_source_value']

In [53]:

def get_tables(client,did,fields=[]):
    """
        getting table lists from google
    """
    r = []
    ref = client.dataset(id)
    tables = list(client.list_tables(ref))
    for table in tables :
        ref = table.reference
        schema = client.get_table(ref).schema
        names = [f.field_name for f in schema]
        x = list(set(names) & set(fields))
        if x  :
            r.append({"name":table.table_id,"fields":names})
    return r
    
def get_fields(**args):
    """
        This function will generate a random set of fields from two tables. Tables are structured as follows 
        {name,fields:[],"y":}, with 
            name     table name (needed to generate sql query)
            fields   list of field names, used in the projection
            y        name of the field to be joined.
        @param xo candidate table in the join
        @param xi candidate table in the join
        @param join field by which the tables can be joined.
    """
    # The set operation will remove redundancies in the field names (not sure it's a good idea)
    xo = args['xo']['fields']
    xi = args['xi']['fields']
    zi = args['xi']['name']
    return list(set(xo) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) )
def generate_sql(**args):
    """
        This function will generate the SQL query for the resulting join
    """
    xo = args['xo']
    xi = args['xi']
    sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y "
    fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y']))
    
    
    sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name'])
    sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y'])
    return sql

In [54]:

xo = {"name":"person","fields":['person_id','date_of_birth','race']}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value']}
get_fields(xo=xo,xi=xi,join="person_id")

Out[54]:

['person_id',
 'measurements.value_as_number',
 'date_of_birth',
 'race',
 'measurements.value_source_value']

In [55]:

xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"}
generate_sql(xo=xo,xi=xi)

Out[55]:

'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '

In [59]:

"""
    We are designing a process that will take two tables that will generate 
"""
import itertools
list(itertools.combinations(['a','b','c'],2))

Out[59]:

[('a', 'b'), ('a', 'c'), ('b', 'c')]

In [87]:

ref = client.dataset('raw')
tables = list(client.list_tables(ref))
names = [table.table_id for table in tables]
(tables[0].reference)

Out[87]:

TableReference(DatasetReference(u'aou-res-deid-vumc-test', u'raw'), 'care_site')

In [85]:

#
# find every table with person id at the very least or a subset of fields
#
def get_tables
q = ['person_id']
pairs = list(itertools.combinations(names,len(names)))
pairs[0]

Out[85]:

(u'care_site',
 u'concept',
 u'concept_ancestor',
 u'concept_class',
 u'concept_relationship',
 u'concept_synonym',
 u'condition_occurrence',
 u'criteria',
 u'death',
 u'device_exposure',
 u'domain',
 u'drug_exposure',
 u'drug_strength',
 u'location',
 u'measurement',
 u'note',
 u'observation',
 u'people_seed',
 u'person',
 u'procedure_occurrence',
 u'relationship',
 u'visit_occurrence',
 u'vocabulary')

In [90]:

list(set(['a','b']) & set(['a']))

Out[90]:

['a']

In [ ]:

</html>

7.3 KiB Raw Blame History

7.3 KiB

Raw Blame History