You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
7.3 KiB
7.3 KiB
None
<html lang="en">
<head>
</head>
</html>
In [66]:
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery as bq

# Location of the service-account key file. The standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable takes precedence so the
# notebook is not tied to a single user's home directory; the previous
# hard-coded location is kept as the fallback.
SERVICE_ACCOUNT_JSON = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json',
)
client = bq.Client.from_service_account_json(SERVICE_ACCOUNT_JSON)
In [33]:
# Column-name lists for the two join candidates; both contain person_id.
xo = "person_id date_of_birth race".split()
xi = "person_id value_as_number value_source_value".split()
In [53]:
def get_tables(client, did, fields=None):
    """
    List the tables of a dataset whose schema contains at least one of the
    requested fields.

    @param client   BigQuery client; must provide dataset(), list_tables() and get_table()
    @param did      identifier of the dataset to inspect
    @param fields   field names to look for. A table is kept when its schema
                    shares at least one name with this list; with no fields
                    given, no table matches and an empty list is returned.
    @return list of {"name": <table id>, "fields": [<every field name of that table>]}
    """
    # None instead of a mutable [] default, so calls never share one list object
    wanted = set(fields or [])
    found = []
    # The dataset is looked up by the `did` argument (not the builtin `id`)
    dataset_ref = client.dataset(did)
    for table in client.list_tables(dataset_ref):
        # list_tables yields lightweight items; the schema needs a get_table call
        schema = client.get_table(table.reference).schema
        # SchemaField exposes the column name as `.name`
        names = [field.name for field in schema]
        if wanted.intersection(names):
            found.append({"name": table.table_id, "fields": names})
    return found
def get_fields(**args):
    """
    Build the list of column names to project when joining two tables.

    Each table is described by a dict {"name": ..., "fields": [...]}:
        name    table name, used to qualify the columns of xi
        fields  list of field names of that table
    The result is the union of xo's field names (left as they are) and xi's
    field names prefixed with "<xi name>.", skipping xi's join field.
    Duplicates are dropped and the order of the returned list is not defined,
    because the union is computed with sets.

    @param xo   first table of the join
    @param xi   second table of the join; its columns are qualified with its name
    @param join name of the field by which the tables are joined
    @return list of column names
    """
    left_fields = args['xo']['fields']
    right_fields = args['xi']['fields']
    right_table = args['xi']['name']

    unqualified = set(left_fields)
    qualified = set()
    for column in right_fields:
        # the join field is not repeated for the second table
        if column != args['join']:
            qualified.add('.'.join([right_table, column]))
    return list(unqualified | qualified)
def generate_sql(**args):
    """
    Generate the SQL query that inner-joins two tables.

    Each table is a dict {"name": ..., "fields": [...], "y": ...} where
        name    table name
        fields  list of field names, used in the projection
        y       name of the field the table is joined on

    @param xo   table placed in the FROM clause
    @param xi   table placed in the INNER JOIN clause
    @return the SQL statement as a string
    """
    xo = args['xo']
    xi = args['xi']
    # The projection must be built from BOTH tables: xo contributes its fields,
    # xi contributes its fields qualified by table name (minus its join field).
    fields = ",".join(get_fields(xo=xo, xi=xi, join=xi['y']))
    # Both sides of the ON condition are qualified with their table name, so the
    # condition stays unambiguous when the two tables share the join column name.
    # NOTE(review): get_fields leaves xo's columns unqualified in the SELECT
    # list; a column present in both tables may still be ambiguous there.
    template = (
        "SELECT {fields} FROM {xo_name} INNER JOIN {xi_name} "
        "ON {xi_name}.{xi_y} = {xo_name}.{xo_y} "
    )
    return template.format(
        fields=fields,
        xo_name=xo['name'],
        xi_name=xi['name'],
        xo_y=xo['y'],
        xi_y=xi['y'],
    )
In [54]:
# Try get_fields on two sample table descriptions joined on person_id.
xo = dict(name="person", fields=['person_id', 'date_of_birth', 'race'])
xi = dict(
    name="measurements",
    fields=['person_id', 'value_as_number', 'value_source_value'],
)
get_fields(xo=xo, xi=xi, join="person_id")
Out[54]:
In [55]:
# Same sample tables, now carrying the join field under "y" as generate_sql expects.
xo = dict(name="person", fields=['person_id', 'date_of_birth', 'race'], y="person_id")
xi = dict(
    name="measurements",
    fields=['person_id', 'value_as_number', 'value_source_value'],
    y="person_id",
)
generate_sql(xo=xo, xi=xi)
Out[55]:
In [59]:
"""
We are designing a process that will take two tables and generate the SQL join between them.
"""
import itertools
# Toy check of itertools.combinations: every 2-element combination of the list,
# materialised with list() so the cell displays them.
list(itertools.combinations(['a','b','c'],2))
Out[59]:
In [87]:
# Reference to the dataset named 'raw', obtained from the BigQuery client.
ref = client.dataset('raw')
# Materialise the table listing so it can be indexed and reused below.
tables = list(client.list_tables(ref))
# Table ids only.
names = [table.table_id for table in tables]
# Display the reference of the first table (raises IndexError when the listing is empty).
(tables[0].reference)
Out[87]:
In [85]:
#
# find every table with person id at the very least or a subset of fields
#
# NOTE: an unfinished `def get_tables` header used to sit here; it was a syntax
# error that prevented the cell from running and has been dropped.
# Fields a candidate table is expected to contain.
# NOTE(review): presumably meant as the `fields` argument of get_tables - confirm.
q = ['person_id']
# Candidate table pairs: combinations of size 2. Using len(names) as the size
# would produce a single tuple holding every table name, not pairs.
pairs = list(itertools.combinations(names, 2))
# Show the first pair, or None when fewer than two tables are available.
pairs[0] if pairs else None
Out[85]:
In [90]:
# Scratch check of set intersection: displays the elements common to both lists.
list(set(['a','b']) & set(['a']))
Out[90]:
In [ ]: