You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
6.7 KiB
Python
159 lines
6.7 KiB
Python
10 months ago
|
"""
|
||
|
Implementing support for google's bigquery
|
||
|
- cloud.bigquery.Read
|
||
|
- cloud.bigquery.Write
|
||
|
"""
|
||
|
import json
|
||
|
from google.oauth2 import service_account
|
||
|
from google.cloud import bigquery as bq
|
||
|
|
||
|
from multiprocessing import Lock, RLock
|
||
|
import pandas as pd
|
||
|
import pandas_gbq as pd_gbq
|
||
|
import numpy as np
|
||
|
import time
|
||
|
|
||
|
MAX_CHUNK = 2000000
|
||
|
class BigQuery:
|
||
|
def __init__(self,**_args):
|
||
|
path = _args['service_key'] if 'service_key' in _args else _args['private_key']
|
||
|
self.credentials = service_account.Credentials.from_service_account_file(path)
|
||
|
self.dataset = _args['dataset'] if 'dataset' in _args else None
|
||
|
self.path = path
|
||
|
self.dtypes = _args['dtypes'] if 'dtypes' in _args else None
|
||
|
self.table = _args['table'] if 'table' in _args else None
|
||
|
self.client = bq.Client.from_service_account_json(self.path)
|
||
|
def meta(self,**_args):
|
||
|
"""
|
||
|
This function returns meta data for a given table or query with dataset/table properly formatted
|
||
|
:param table name of the name WITHOUT including dataset
|
||
|
:param sql sql query to be pulled,
|
||
|
"""
|
||
|
table = _args['table'] if 'table' in _args else self.table
|
||
|
|
||
|
try:
|
||
|
if table :
|
||
|
_dataset = self.dataset if 'dataset' not in _args else _args['dataset']
|
||
|
sql = f"""SELECT column_name as name, data_type as type FROM {_dataset}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = '{table}' """
|
||
|
_info = {'credentials':self.credentials,'dialect':'standard'}
|
||
|
return pd_gbq.read_gbq(sql,**_info).to_dict(orient='records')
|
||
|
# return self.read(sql=sql).to_dict(orient='records')
|
||
|
# ref = self.client.dataset(self.dataset).table(table)
|
||
|
|
||
|
# _schema = self.client.get_table(ref).schema
|
||
|
# return [{"name":_item.name,"type":_item.field_type,"description":( "" if not hasattr(_item,"description") else _item.description )} for _item in _schema]
|
||
|
else :
|
||
|
return []
|
||
|
except Exception as e:
|
||
|
|
||
|
return []
|
||
|
def has(self,**_args):
|
||
|
found = False
|
||
|
try:
|
||
|
_has = self.meta(**_args)
|
||
|
found = _has is not None and len(_has) > 0
|
||
|
except Exception as e:
|
||
|
pass
|
||
|
return found
|
||
|
class Reader (BigQuery):
|
||
|
"""
|
||
|
Implementing support for reading from bigquery, This class acts as a wrapper around google's API
|
||
|
"""
|
||
|
def __init__(self,**_args):
|
||
|
|
||
|
super().__init__(**_args)
|
||
|
def apply(self,sql):
|
||
|
return self.read(sql=sql)
|
||
|
|
||
|
def read(self,**_args):
|
||
|
SQL = None
|
||
|
table = self.table if 'table' not in _args else _args['table']
|
||
|
if 'sql' in _args :
|
||
|
SQL = _args['sql']
|
||
|
elif table:
|
||
|
|
||
|
table = "".join(["`",table,"`"]) if '.' in table else "".join(["`:dataset.",table,"`"])
|
||
|
SQL = "SELECT * FROM :table ".replace(":table",table)
|
||
|
if not SQL :
|
||
|
return None
|
||
|
if SQL and 'limit' in _args:
|
||
|
SQL += " LIMIT "+str(_args['limit'])
|
||
|
if (':dataset' in SQL or ':DATASET' in SQL) and self.dataset:
|
||
|
SQL = SQL.replace(':dataset',self.dataset).replace(':DATASET',self.dataset)
|
||
|
_info = {'credentials':self.credentials,'dialect':'standard'}
|
||
|
return pd_gbq.read_gbq(SQL,**_info) if SQL else None
|
||
|
# return self.client.query(SQL).to_dataframe() if SQL else None
|
||
|
|
||
|
class Writer (BigQuery):
|
||
|
"""
|
||
|
This class implements support for writing against bigquery
|
||
|
"""
|
||
|
lock = RLock()
|
||
|
def __init__(self,**_args):
|
||
|
super().__init__(**_args)
|
||
|
|
||
|
self.parallel = False if 'lock' not in _args else _args['lock']
|
||
|
self.table = _args['table'] if 'table' in _args else None
|
||
|
self.mode = {'if_exists':'append','chunksize':900000,'destination_table':self.table,'credentials':self.credentials}
|
||
|
self._chunks = 1 if 'chunks' not in _args else int(_args['chunks'])
|
||
|
self._location = 'US' if 'location' not in _args else _args['location']
|
||
|
def write(self,_data,**_args) :
|
||
|
"""
|
||
|
This function will perform a write to bigquery
|
||
|
:_data data-frame to be written to bigquery
|
||
|
"""
|
||
|
try:
|
||
|
if self.parallel or 'lock' in _args :
|