You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
data-transport/transport/cloud/bigquery.py

159 lines
6.7 KiB
Python

"""
Implementing support for google's bigquery
- cloud.bigquery.Read
- cloud.bigquery.Write
"""
import json
from google.oauth2 import service_account
from google.cloud import bigquery as bq
from multiprocessing import Lock, RLock
import pandas as pd
import pandas_gbq as pd_gbq
import numpy as np
import time
MAX_CHUNK = 2000000
class BigQuery:
def __init__(self,**_args):
path = _args['service_key'] if 'service_key' in _args else _args['private_key']
self.credentials = service_account.Credentials.from_service_account_file(path)
self.dataset = _args['dataset'] if 'dataset' in _args else None
self.path = path
self.dtypes = _args['dtypes'] if 'dtypes' in _args else None
self.table = _args['table'] if 'table' in _args else None
self.client = bq.Client.from_service_account_json(self.path)
def meta(self,**_args):
"""
This function returns meta data for a given table or query with dataset/table properly formatted
:param table name of the name WITHOUT including dataset
:param sql sql query to be pulled,
"""
table = _args['table'] if 'table' in _args else self.table
try:
if table :
_dataset = self.dataset if 'dataset' not in _args else _args['dataset']
sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """
_info = {'credentials':self.credentials,'dialect':'standard'}
return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records')
# return self.read(sql=sql).to_dict(orient='records')
# ref = self.client.dataset(self.dataset).table(table)
# _schema = self.client.get_table(ref).schema
# return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema]
else :
return []
except Exception as e:
return []
def has(self,**_args):
found = False
try:
_has = self.meta(**_args)
found = _has is not None and len(_has) > 0
except Exception as e:
pass
return found
class Reader (BigQuery):
"""
Implementing support for reading from bigquery, This class acts as a wrapper around google's API
"""
def __init__(self,**_args):
super().__init__(**_args)
def apply(self,sql):
return self.read(sql=sql)
def read(self,**_args):
SQL = None
table = self.table if 'table' not in _args else _args['table']
if 'sql' in _args :
SQL = _args['sql']
elif table:
table = "".join(["`",table,"`"]) if '.' in table else "".join(["`:dataset.",table,"`"])
SQL = "SELECT * FROM :table ".replace(":table",table)
if not SQL :
return None
if SQL and 'limit' in _args:
SQL += " LIMIT "+str(_args['limit'])
if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset:
SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset)
_info = {'credentials':self.credentials,'dialect':'standard'}
return pd_gbq.read_gbq(SQL,**_info) if SQL else None
# return self.client.query(SQL).to_dataframe() if SQL else None
class Writer (BigQuery):
"""
This class implements support for writing against bigquery
"""
lock = RLock()
def __init__(self,**_args):
super().__init__(**_args)
self.parallel = False if 'lock' not in _args else _args['lock']
self.table = _args['table'] if 'table' in _args else None
self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials}
self._chunks = 1 if 'chunks' not in _args else int(_args['chunks'])
self._location = 'US' if 'location' not in _args else _args['location']
def write(self,_data,**_args) :
"""
This function will perform a write to bigquery
:_data data-frame to be written to bigquery
"""
try:
if self.parallel or 'lock' in _args :