Remove module-level BigQuery client initialization
This commit is contained in:
@@ -1,23 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
BIGQUERY HELPER FUNCTIONS
|
BIGQUERY HELPER FUNCTIONS
|
||||||
Kimball Data Warehouse Architecture
|
Kimball Data Warehouse Architecture
|
||||||
|
|
||||||
Layer Assignment (Kimball terminology):
|
|
||||||
RAW (Bronze) → raw_fao, raw_worldbank, raw_unicef
|
|
||||||
STAGING (Silver) → staging_integrated, cleaned_integrated
|
|
||||||
AUDIT (Audit) → etl_logs, etl_metadata
|
|
||||||
DW (Gold) → dim_*, fact_food_security, fact_food_security_eligible
|
|
||||||
|
|
||||||
Functions:
|
|
||||||
setup_logging() — Setup file & console logging
|
|
||||||
log_update() — Audit log ETL ke staging (Silver)
|
|
||||||
save_etl_metadata() — Save ETL metadata ke staging (Silver), preserve created_at
|
|
||||||
load_to_bigquery() — Load DataFrame ke layer tertentu
|
|
||||||
read_from_bigquery() — Read dari layer tertentu
|
|
||||||
truncate_table() — Hapus semua rows dari table
|
|
||||||
drop_table() — Drop table dari layer tertentu
|
|
||||||
get_staging_schema() — Schema staging_integrated
|
|
||||||
get_etl_metadata_schema() — Schema etl_metadata
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -25,7 +8,7 @@ import logging
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import pytz
|
import pytz
|
||||||
from google.cloud import bigquery
|
from google.cloud import bigquery
|
||||||
from bigquery_config import (
|
from scripts.bigquery_config import (
|
||||||
get_bigquery_client,
|
get_bigquery_client,
|
||||||
get_table_id,
|
get_table_id,
|
||||||
table_exists,
|
table_exists,
|
||||||
@@ -35,16 +18,9 @@ import json
|
|||||||
|
|
||||||
# LOGGING SETUP
|
# LOGGING SETUP
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def setup_logging(log_file: str = 'logs/etl_pipeline.log') -> logging.Logger:
|
def setup_logging(log_file: str = 'logs/etl_pipeline.log') -> logging.Logger:
|
||||||
"""
|
|
||||||
Setup logging system untuk tracking eksekusi ETL
|
|
||||||
|
|
||||||
Args:
|
|
||||||
log_file: Path to log file
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
logging.Logger: Configured logger
|
|
||||||
"""
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||||
@@ -55,27 +31,8 @@ def setup_logging(log_file: str = 'logs/etl_pipeline.log') -> logging.Logger:
|
|||||||
)
|
)
|
||||||
return logging.getLogger(__name__)
|
return logging.getLogger(__name__)
|
||||||
|
|
||||||
# ETL AUDIT LOG — STAGING LAYER (Silver)
|
|
||||||
|
|
||||||
def ensure_etl_logs_table(client: bigquery.Client):
|
def ensure_etl_logs_table(client: bigquery.Client):
|
||||||
"""
|
|
||||||
Buat table etl_logs di STAGING layer (Silver) jika belum ada.
|
|
||||||
|
|
||||||
Kimball context:
|
|
||||||
etl_logs adalah operational/audit table, bukan bagian dari Star Schema.
|
|
||||||
Disimpan di Staging layer karena merupakan output proses ETL,
|
|
||||||
bukan data warehouse final.
|
|
||||||
|
|
||||||
Schema:
|
|
||||||
id STRING — unique log ID
|
|
||||||
timestamp DATETIME — waktu log dibuat
|
|
||||||
layer STRING — layer yang diproses (RAW/STAGING/DW)
|
|
||||||
table_name STRING — nama table yang diproses
|
|
||||||
update_method STRING — full_refresh / incremental
|
|
||||||
rows_affected INTEGER — jumlah rows
|
|
||||||
status STRING — success / failed
|
|
||||||
error_message STRING — pesan error jika gagal
|
|
||||||
"""
|
|
||||||
if not table_exists(client, 'etl_logs', layer='audit'):
|
if not table_exists(client, 'etl_logs', layer='audit'):
|
||||||
table_id = get_table_id('etl_logs', layer='audit')
|
table_id = get_table_id('etl_logs', layer='audit')
|
||||||
schema = [
|
schema = [
|
||||||
@@ -96,28 +53,6 @@ def ensure_etl_logs_table(client: bigquery.Client):
|
|||||||
def log_update(client: bigquery.Client, layer: str, table_name: str,
|
def log_update(client: bigquery.Client, layer: str, table_name: str,
|
||||||
update_method: str, rows_affected: int,
|
update_method: str, rows_affected: int,
|
||||||
status: str = 'success', error_msg: str = None):
|
status: str = 'success', error_msg: str = None):
|
||||||
"""
|
|
||||||
Catat aktivitas ETL ke etl_logs (STAGING/Silver) untuk audit trail.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
layer : Layer yang diproses — 'RAW', 'STAGING', atau 'DW'
|
|
||||||
table_name : Nama table yang diproses
|
|
||||||
update_method : 'full_refresh' atau 'incremental'
|
|
||||||
rows_affected : Jumlah rows yang diproses
|
|
||||||
status : 'success' atau 'failed'
|
|
||||||
error_msg : Pesan error jika status='failed'
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
# Log saat load raw data
|
|
||||||
log_update(client, 'RAW', 'raw_fao', 'full_refresh', 5000)
|
|
||||||
|
|
||||||
# Log saat proses staging
|
|
||||||
log_update(client, 'STAGING', 'staging_integrated', 'full_refresh', 12000)
|
|
||||||
|
|
||||||
# Log saat load ke DW
|
|
||||||
log_update(client, 'DW', 'fact_food_security', 'full_refresh', 8000)
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
ensure_etl_logs_table(client)
|
ensure_etl_logs_table(client)
|
||||||
|
|
||||||
@@ -133,7 +68,6 @@ def log_update(client: bigquery.Client, layer: str, table_name: str,
|
|||||||
'error_message': error_msg
|
'error_message': error_msg
|
||||||
}])
|
}])
|
||||||
|
|
||||||
# Hapus timezone untuk BigQuery DATETIME
|
|
||||||
log_data['timestamp'] = pd.to_datetime(log_data['timestamp']).dt.tz_localize(None)
|
log_data['timestamp'] = pd.to_datetime(log_data['timestamp']).dt.tz_localize(None)
|
||||||
log_data['id'] = log_data['id'].astype(str)
|
log_data['id'] = log_data['id'].astype(str)
|
||||||
|
|
||||||
@@ -145,40 +79,11 @@ def log_update(client: bigquery.Client, layer: str, table_name: str,
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" Warning: Failed to write etl_logs [STAGING]: {e}")
|
print(f" Warning: Failed to write etl_logs [STAGING]: {e}")
|
||||||
|
|
||||||
# DATA LOADING TO BIGQUERY
|
|
||||||
|
|
||||||
def load_to_bigquery(client: bigquery.Client, df: pd.DataFrame,
|
def load_to_bigquery(client: bigquery.Client, df: pd.DataFrame,
|
||||||
table_name: str, layer: str = "bronze",
|
table_name: str, layer: str = "bronze",
|
||||||
write_disposition: str = "WRITE_TRUNCATE",
|
write_disposition: str = "WRITE_TRUNCATE",
|
||||||
schema: list = None) -> int:
|
schema: list = None) -> int:
|
||||||
"""
|
|
||||||
Load DataFrame ke BigQuery table pada layer tertentu.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
df : DataFrame yang akan di-load
|
|
||||||
table_name : Nama table tujuan
|
|
||||||
layer : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
|
|
||||||
write_disposition : WRITE_TRUNCATE (replace) atau WRITE_APPEND (append)
|
|
||||||
schema : Optional schema (list of SchemaField)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
int: Jumlah rows yang berhasil di-load
|
|
||||||
|
|
||||||
Examples (Kimball flow):
|
|
||||||
# RAW layer — data mentah dari sumber
|
|
||||||
load_to_bigquery(client, df_fao, 'raw_fao', layer='bronze')
|
|
||||||
load_to_bigquery(client, df_wb, 'raw_worldbank', layer='bronze')
|
|
||||||
load_to_bigquery(client, df_unicef, 'raw_unicef', layer='bronze')
|
|
||||||
|
|
||||||
# STAGING layer — cleaned & integrated
|
|
||||||
load_to_bigquery(client, df_staging, 'staging_integrated', layer='silver')
|
|
||||||
|
|
||||||
# DW layer — Kimball Star Schema
|
|
||||||
load_to_bigquery(client, df_dim, 'dim_country', layer='gold')
|
|
||||||
load_to_bigquery(client, df_fact, 'fact_food_security', layer='gold')
|
|
||||||
load_to_bigquery(client, df_elig, 'fact_food_security_eligible', layer='gold')
|
|
||||||
"""
|
|
||||||
table_id = get_table_id(table_name, layer)
|
table_id = get_table_id(table_name, layer)
|
||||||
job_config = bigquery.LoadJobConfig(
|
job_config = bigquery.LoadJobConfig(
|
||||||
write_disposition=write_disposition,
|
write_disposition=write_disposition,
|
||||||
@@ -193,36 +98,11 @@ def load_to_bigquery(client: bigquery.Client, df: pd.DataFrame,
|
|||||||
print(f" ✓ Loaded {table.num_rows:,} rows → [{layer.upper()}] {table_name}")
|
print(f" ✓ Loaded {table.num_rows:,} rows → [{layer.upper()}] {table_name}")
|
||||||
return table.num_rows
|
return table.num_rows
|
||||||
|
|
||||||
# DATA READING FROM BIGQUERY
|
|
||||||
|
|
||||||
def read_from_bigquery(client: bigquery.Client,
|
def read_from_bigquery(client: bigquery.Client,
|
||||||
table_name: str = None,
|
table_name: str = None,
|
||||||
layer: str = "bronze",
|
layer: str = "bronze",
|
||||||
query: str = None) -> pd.DataFrame:
|
query: str = None) -> pd.DataFrame:
|
||||||
"""
|
|
||||||
Read data dari BigQuery table atau jalankan custom query.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
table_name : Nama table yang akan dibaca
|
|
||||||
layer : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
|
|
||||||
query : Custom SQL query (jika diisi, table_name diabaikan)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
pd.DataFrame: Hasil query
|
|
||||||
|
|
||||||
Examples (Kimball flow):
|
|
||||||
# Baca dari RAW layer
|
|
||||||
df = read_from_bigquery(client, 'raw_fao', layer='bronze')
|
|
||||||
|
|
||||||
# Baca dari STAGING layer
|
|
||||||
df = read_from_bigquery(client, 'staging_integrated', layer='silver')
|
|
||||||
|
|
||||||
# Baca dari DW layer
|
|
||||||
df = read_from_bigquery(client, 'fact_food_security', layer='gold')
|
|
||||||
df = read_from_bigquery(client, 'fact_food_security_eligible', layer='gold')
|
|
||||||
df = read_from_bigquery(client, 'dim_country', layer='gold')
|
|
||||||
"""
|
|
||||||
if query:
|
if query:
|
||||||
return client.query(query).result().to_dataframe(create_bqstorage_client=False)
|
return client.query(query).result().to_dataframe(create_bqstorage_client=False)
|
||||||
elif table_name:
|
elif table_name:
|
||||||
@@ -231,17 +111,8 @@ def read_from_bigquery(client: bigquery.Client,
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Either table_name or query must be provided")
|
raise ValueError("Either table_name or query must be provided")
|
||||||
|
|
||||||
# TABLE MANAGEMENT
|
|
||||||
|
|
||||||
def truncate_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
|
def truncate_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
|
||||||
"""
|
|
||||||
Hapus semua rows dari table (kosongkan table, struktur tetap ada).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
table_name : Nama table
|
|
||||||
layer : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
|
|
||||||
"""
|
|
||||||
table_id = get_table_id(table_name, layer)
|
table_id = get_table_id(table_name, layer)
|
||||||
job = client.query(f"DELETE FROM `{table_id}` WHERE TRUE")
|
job = client.query(f"DELETE FROM `{table_id}` WHERE TRUE")
|
||||||
job.result()
|
job.result()
|
||||||
@@ -249,30 +120,12 @@ def truncate_table(client: bigquery.Client, table_name: str, layer: str = "bronz
|
|||||||
|
|
||||||
|
|
||||||
def drop_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
|
def drop_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
|
||||||
"""
|
|
||||||
Drop table dari BigQuery jika ada.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
table_name : Nama table
|
|
||||||
layer : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
|
|
||||||
"""
|
|
||||||
table_id = get_table_id(table_name, layer)
|
table_id = get_table_id(table_name, layer)
|
||||||
client.delete_table(table_id, not_found_ok=True)
|
client.delete_table(table_id, not_found_ok=True)
|
||||||
print(f" Dropped [{layer.upper()}] table: {table_name}")
|
print(f" Dropped [{layer.upper()}] table: {table_name}")
|
||||||
|
|
||||||
# SCHEMA DEFINITIONS — STAGING LAYER (Silver)
|
|
||||||
|
|
||||||
def get_staging_schema() -> list:
|
def get_staging_schema() -> list:
|
||||||
"""
|
|
||||||
Schema untuk staging_integrated table (STAGING/Silver layer).
|
|
||||||
|
|
||||||
Staging table adalah area integrasi data dari semua sumber (FAO, WB, UNICEF)
|
|
||||||
sebelum di-load ke DW layer sebagai Dim & Fact tables.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list: List of SchemaField objects
|
|
||||||
"""
|
|
||||||
return [
|
return [
|
||||||
bigquery.SchemaField("source", "STRING", mode="REQUIRED"),
|
bigquery.SchemaField("source", "STRING", mode="REQUIRED"),
|
||||||
bigquery.SchemaField("indicator_original", "STRING", mode="REQUIRED"),
|
bigquery.SchemaField("indicator_original", "STRING", mode="REQUIRED"),
|
||||||
@@ -286,15 +139,6 @@ def get_staging_schema() -> list:
|
|||||||
|
|
||||||
|
|
||||||
def get_etl_metadata_schema() -> list:
|
def get_etl_metadata_schema() -> list:
|
||||||
"""
|
|
||||||
Schema untuk etl_metadata table (STAGING/Silver layer).
|
|
||||||
|
|
||||||
ETL metadata disimpan di Staging layer karena merupakan operational table
|
|
||||||
untuk reproducibility & tracking, bukan bagian Star Schema DW.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list: List of SchemaField objects
|
|
||||||
"""
|
|
||||||
return [
|
return [
|
||||||
bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
|
bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
|
||||||
bigquery.SchemaField("source_class", "STRING", mode="REQUIRED"),
|
bigquery.SchemaField("source_class", "STRING", mode="REQUIRED"),
|
||||||
@@ -311,43 +155,17 @@ def get_etl_metadata_schema() -> list:
|
|||||||
bigquery.SchemaField("updated_at", "TIMESTAMP", mode="REQUIRED"),
|
bigquery.SchemaField("updated_at", "TIMESTAMP", mode="REQUIRED"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# ETL METADATA — STAGING LAYER (Silver)
|
|
||||||
# FIXED: Preserve created_at dari eksekusi pertama
|
|
||||||
|
|
||||||
def save_etl_metadata(client: bigquery.Client, metadata: dict):
|
def save_etl_metadata(client: bigquery.Client, metadata: dict):
|
||||||
"""
|
|
||||||
Save ETL metadata ke etl_metadata table (STAGING/Silver layer).
|
|
||||||
|
|
||||||
Logic created_at vs updated_at:
|
|
||||||
created_at : diambil dari record PERTAMA untuk table_name yang sama
|
|
||||||
(preserved across runs — untuk reproducibility)
|
|
||||||
updated_at : selalu diperbarui ke waktu eksekusi sekarang
|
|
||||||
|
|
||||||
Args:
|
|
||||||
client : BigQuery client
|
|
||||||
metadata : Dict berisi informasi eksekusi ETL:
|
|
||||||
table_name (required)
|
|
||||||
source_class (required)
|
|
||||||
execution_timestamp
|
|
||||||
duration_seconds
|
|
||||||
rows_fetched
|
|
||||||
rows_transformed
|
|
||||||
rows_loaded
|
|
||||||
completeness_pct
|
|
||||||
config_snapshot (JSON string)
|
|
||||||
validation_metrics (JSON string)
|
|
||||||
"""
|
|
||||||
table_name = metadata.get('table_name', 'unknown')
|
table_name = metadata.get('table_name', 'unknown')
|
||||||
table_id = get_table_id('etl_metadata', layer='audit')
|
table_id = get_table_id('etl_metadata', layer='audit')
|
||||||
|
|
||||||
# Buat table jika belum ada
|
|
||||||
if not table_exists(client, 'etl_metadata', layer='audit'):
|
if not table_exists(client, 'etl_metadata', layer='audit'):
|
||||||
schema = get_etl_metadata_schema()
|
schema = get_etl_metadata_schema()
|
||||||
table = bigquery.Table(table_id, schema=schema)
|
table = bigquery.Table(table_id, schema=schema)
|
||||||
client.create_table(table)
|
client.create_table(table)
|
||||||
print(f" [AUDIT] Created table: etl_metadata")
|
print(f" [AUDIT] Created table: etl_metadata")
|
||||||
|
|
||||||
# Ambil created_at pertama untuk table ini (preserve across runs)
|
|
||||||
check_query = f"""
|
check_query = f"""
|
||||||
SELECT MIN(created_at) AS first_created_at
|
SELECT MIN(created_at) AS first_created_at
|
||||||
FROM `{table_id}`
|
FROM `{table_id}`
|
||||||
@@ -373,7 +191,6 @@ def save_etl_metadata(client: bigquery.Client, metadata: dict):
|
|||||||
|
|
||||||
current_time = datetime.now()
|
current_time = datetime.now()
|
||||||
|
|
||||||
# Generate unique ID
|
|
||||||
import hashlib
|
import hashlib
|
||||||
record_id = hashlib.md5(
|
record_id = hashlib.md5(
|
||||||
f"{metadata.get('source_class')}_{table_name}_{current_time.isoformat()}".encode()
|
f"{metadata.get('source_class')}_{table_name}_{current_time.isoformat()}".encode()
|
||||||
@@ -391,15 +208,13 @@ def save_etl_metadata(client: bigquery.Client, metadata: dict):
|
|||||||
'completeness_pct' : float(metadata.get('completeness_pct', 0)),
|
'completeness_pct' : float(metadata.get('completeness_pct', 0)),
|
||||||
'config_snapshot' : metadata.get('config_snapshot', '{}'),
|
'config_snapshot' : metadata.get('config_snapshot', '{}'),
|
||||||
'validation_metrics' : metadata.get('validation_metrics', '{}'),
|
'validation_metrics' : metadata.get('validation_metrics', '{}'),
|
||||||
'created_at' : created_at, # PRESERVED dari run pertama
|
'created_at' : created_at,
|
||||||
'updated_at' : current_time # SELALU waktu sekarang
|
'updated_at' : current_time
|
||||||
}])
|
}])
|
||||||
|
|
||||||
# Hapus timezone untuk BigQuery
|
|
||||||
for col in ['execution_timestamp', 'created_at', 'updated_at']:
|
for col in ['execution_timestamp', 'created_at', 'updated_at']:
|
||||||
meta_df[col] = pd.to_datetime(meta_df[col]).dt.tz_localize(None)
|
meta_df[col] = pd.to_datetime(meta_df[col]).dt.tz_localize(None)
|
||||||
|
|
||||||
# APPEND ke STAGING layer (Silver)
|
|
||||||
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
|
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
|
||||||
job = client.load_table_from_dataframe(meta_df, table_id, job_config=job_config)
|
job = client.load_table_from_dataframe(meta_df, table_id, job_config=job_config)
|
||||||
job.result()
|
job.result()
|
||||||
@@ -409,14 +224,3 @@ def save_etl_metadata(client: bigquery.Client, metadata: dict):
|
|||||||
else:
|
else:
|
||||||
print(f"etl_metadata — preserved | created_at : {created_at}")
|
print(f"etl_metadata — preserved | created_at : {created_at}")
|
||||||
print(f"etl_metadata — updated_at : {current_time}")
|
print(f"etl_metadata — updated_at : {current_time}")
|
||||||
|
|
||||||
# INITIALIZE
|
|
||||||
|
|
||||||
logger = setup_logging()
|
|
||||||
client = get_bigquery_client()
|
|
||||||
|
|
||||||
print("BigQuery Helpers Loaded — Kimball DW Architecture")
|
|
||||||
print(f"Project : {CONFIG['bigquery']['project_id']}")
|
|
||||||
print(f"Raw (Bronze) : {CONFIG['bigquery']['dataset_bronze']}")
|
|
||||||
print(f"Staging (Silver) : {CONFIG['bigquery']['dataset_silver']}")
|
|
||||||
print(f"DW (Gold) : {CONFIG['bigquery']['dataset_gold']}")
|
|
||||||
Reference in New Issue
Block a user