422 lines
16 KiB
Python
422 lines
16 KiB
Python
"""
|
|
BIGQUERY HELPER FUNCTIONS
|
|
Kimball Data Warehouse Architecture
|
|
|
|
Layer Assignment (Kimball terminology):
|
|
RAW (Bronze) → raw_fao, raw_worldbank, raw_unicef
|
|
STAGING (Silver) → staging_integrated, cleaned_integrated
|
|
AUDIT (Audit) → etl_logs, etl_metadata
|
|
DW (Gold) → dim_*, fact_food_security, fact_food_security_eligible
|
|
|
|
Functions:
|
|
setup_logging() — Setup file & console logging
|
|
log_update() — Write ETL audit log to the audit layer
|
|
save_etl_metadata() — Save ETL metadata to the audit layer, preserve created_at
|
|
load_to_bigquery() — Load DataFrame ke layer tertentu
|
|
read_from_bigquery() — Read dari layer tertentu
|
|
truncate_table() — Hapus semua rows dari table
|
|
drop_table() — Drop table dari layer tertentu
|
|
get_staging_schema() — Schema staging_integrated
|
|
get_etl_metadata_schema() — Schema etl_metadata
|
|
"""
|
|
|
|
import pandas as pd
|
|
import logging
|
|
from datetime import datetime
|
|
import pytz
|
|
from google.cloud import bigquery
|
|
from bigquery_config import (
|
|
get_bigquery_client,
|
|
get_table_id,
|
|
table_exists,
|
|
CONFIG
|
|
)
|
|
import json
|
|
|
|
# LOGGING SETUP
|
|
|
|
def setup_logging(log_file: str = 'logs/etl_pipeline.log') -> logging.Logger:
    """
    Set up file + console logging for tracking ETL execution.

    The parent directory of ``log_file`` is created if missing, so the
    default 'logs/etl_pipeline.log' works on a fresh checkout (previously
    ``logging.FileHandler`` raised FileNotFoundError when 'logs/' did not
    exist).

    Args:
        log_file: Path to the log file.

    Returns:
        logging.Logger: Logger configured with file and console handlers.
    """
    from pathlib import Path

    # Ensure the log directory exists before FileHandler tries to open the file.
    Path(log_file).parent.mkdir(parents=True, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
    return logging.getLogger(__name__)
|
|
|
|
# ETL AUDIT LOG — STAGING LAYER (Silver)
|
|
|
|
def ensure_etl_logs_table(client: bigquery.Client):
    """
    Create the etl_logs table in the AUDIT layer if it does not already exist.

    Kimball context:
        etl_logs is an operational/audit table, not part of the Star Schema.
        It is an output of the ETL process itself, not final warehouse data.
        NOTE(review): an earlier docstring said STAGING (Silver), but the
        code below targets layer='audit' — documented as AUDIT here.

    Schema:
        id             STRING   — unique log ID
        timestamp      DATETIME — when the log entry was created
        layer          STRING   — processed layer (RAW/STAGING/DW)
        table_name     STRING   — name of the processed table
        update_method  STRING   — full_refresh / incremental
        rows_affected  INTEGER  — number of rows processed
        status         STRING   — success / failed
        error_message  STRING   — error message on failure
    """
    if not table_exists(client, 'etl_logs', layer='audit'):
        table_id = get_table_id('etl_logs', layer='audit')
        schema = [
            bigquery.SchemaField("id", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("timestamp", "DATETIME", mode="REQUIRED"),
            bigquery.SchemaField("layer", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("table_name", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("update_method", "STRING", mode="REQUIRED"),
            bigquery.SchemaField("rows_affected", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("status", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("error_message", "STRING", mode="NULLABLE"),
        ]
        table = bigquery.Table(table_id, schema=schema)
        client.create_table(table)
        print(f" [AUDIT] Created table: etl_logs")
|
|
|
|
|
|
def log_update(client: bigquery.Client, layer: str, table_name: str,
               update_method: str, rows_affected: int,
               status: str = 'success', error_msg: str = None):
    """
    Record ETL activity in etl_logs (AUDIT layer) as an audit trail.

    Best-effort by design: any failure is reported as a console warning and
    never propagates, so audit logging can never break the pipeline itself.

    Args:
        client        : BigQuery client
        layer         : Layer being processed — 'RAW', 'STAGING', or 'DW'
        table_name    : Name of the processed table
        update_method : 'full_refresh' or 'incremental'
        rows_affected : Number of rows processed
        status        : 'success' or 'failed'
        error_msg     : Error message when status='failed'

    Examples:
        # Log a raw-data load
        log_update(client, 'RAW', 'raw_fao', 'full_refresh', 5000)

        # Log staging processing
        log_update(client, 'STAGING', 'staging_integrated', 'full_refresh', 12000)

        # Log a DW load
        log_update(client, 'DW', 'fact_food_security', 'full_refresh', 8000)
    """
    import uuid

    try:
        ensure_etl_logs_table(client)

        log_data = pd.DataFrame([{
            # uuid4 guarantees uniqueness; the previous timestamp-hash id
            # could collide for two calls within the same microsecond.
            'id'           : str(uuid.uuid4()),
            'timestamp'    : datetime.now(pytz.timezone('Asia/Jakarta')),
            'layer'        : layer.upper(),
            'table_name'   : table_name,
            'update_method': update_method,
            'rows_affected': rows_affected,
            'status'       : status,
            'error_message': error_msg
        }])

        # BigQuery DATETIME is timezone-naive — strip tz info before loading.
        log_data['timestamp'] = pd.to_datetime(log_data['timestamp']).dt.tz_localize(None)
        log_data['id'] = log_data['id'].astype(str)

        table_id = get_table_id('etl_logs', layer='audit')
        job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
        job = client.load_table_from_dataframe(log_data, table_id, job_config=job_config)
        job.result()

    except Exception as e:
        # Deliberate best-effort: audit failures must not abort the ETL run.
        # (Message corrected: the table lives in the AUDIT dataset, not STAGING.)
        print(f" Warning: Failed to write etl_logs [AUDIT]: {e}")
|
|
|
|
# DATA LOADING TO BIGQUERY
|
|
|
|
def load_to_bigquery(client: bigquery.Client, df: pd.DataFrame,
                     table_name: str, layer: str = "bronze",
                     write_disposition: str = "WRITE_TRUNCATE",
                     schema: list = None) -> int:
    """
    Load a DataFrame into a BigQuery table on the given layer.

    Args:
        client            : BigQuery client
        df                : DataFrame to load
        table_name        : Destination table name
        layer             : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
        write_disposition : WRITE_TRUNCATE (replace) or WRITE_APPEND (append)
        schema            : Optional schema (list of SchemaField)

    Returns:
        int: Number of rows in the destination table after the load.

    Examples (Kimball flow):
        # RAW layer — untouched source data
        load_to_bigquery(client, df_fao, 'raw_fao', layer='bronze')
        load_to_bigquery(client, df_wb, 'raw_worldbank', layer='bronze')
        load_to_bigquery(client, df_unicef, 'raw_unicef', layer='bronze')

        # STAGING layer — cleaned & integrated
        load_to_bigquery(client, df_staging, 'staging_integrated', layer='silver')

        # DW layer — Kimball Star Schema
        load_to_bigquery(client, df_dim, 'dim_country', layer='gold')
        load_to_bigquery(client, df_fact, 'fact_food_security', layer='gold')
        load_to_bigquery(client, df_elig, 'fact_food_security_eligible', layer='gold')
    """
    destination = get_table_id(table_name, layer)

    # Autodetect the schema only when none was supplied explicitly.
    config = bigquery.LoadJobConfig(
        write_disposition=write_disposition,
        autodetect=schema is None,
        schema=schema,
    )

    client.load_table_from_dataframe(df, destination, job_config=config).result()

    loaded = client.get_table(destination)
    print(f" ✓ Loaded {loaded.num_rows:,} rows → [{layer.upper()}] {table_name}")
    return loaded.num_rows
|
|
|
|
# DATA READING FROM BIGQUERY
|
|
|
|
def read_from_bigquery(client: bigquery.Client,
                       table_name: str = None,
                       layer: str = "bronze",
                       query: str = None) -> pd.DataFrame:
    """
    Read data from a BigQuery table, or run a custom SQL query.

    Args:
        client     : BigQuery client
        table_name : Table to read (ignored when ``query`` is given)
        layer      : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
        query      : Custom SQL query; takes precedence over table_name

    Returns:
        pd.DataFrame: Query result.

    Raises:
        ValueError: If neither table_name nor query is provided.

    Examples (Kimball flow):
        # RAW layer
        df = read_from_bigquery(client, 'raw_fao', layer='bronze')

        # STAGING layer
        df = read_from_bigquery(client, 'staging_integrated', layer='silver')

        # DW layer
        df = read_from_bigquery(client, 'fact_food_security', layer='gold')
        df = read_from_bigquery(client, 'fact_food_security_eligible', layer='gold')
        df = read_from_bigquery(client, 'dim_country', layer='gold')
    """
    # Guard clause: at least one of the two inputs is required.
    if not query and not table_name:
        raise ValueError("Either table_name or query must be provided")

    # A custom query wins; otherwise read the whole table.
    sql = query if query else f"SELECT * FROM `{get_table_id(table_name, layer)}`"
    return client.query(sql).result().to_dataframe(create_bqstorage_client=False)
|
|
|
|
# TABLE MANAGEMENT
|
|
|
|
def truncate_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
    """
    Delete every row from a table; the table structure itself is kept.

    Args:
        client     : BigQuery client
        table_name : Table name
        layer      : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
    """
    target = get_table_id(table_name, layer)
    # DELETE ... WHERE TRUE empties the table without dropping it.
    client.query(f"DELETE FROM `{target}` WHERE TRUE").result()
    print(f" Truncated [{layer.upper()}] table: {table_name}")
|
|
|
|
|
|
def drop_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
    """
    Drop a table from BigQuery if it exists.

    Args:
        client     : BigQuery client
        table_name : Table name
        layer      : 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
    """
    # not_found_ok makes this a no-op when the table is already gone.
    client.delete_table(get_table_id(table_name, layer), not_found_ok=True)
    print(f" Dropped [{layer.upper()}] table: {table_name}")
|
|
|
|
# SCHEMA DEFINITIONS — STAGING LAYER (Silver)
|
|
|
|
def get_staging_schema() -> list:
    """
    Schema for the staging_integrated table (STAGING/Silver layer).

    The staging table is the integration area for all sources (FAO, WB,
    UNICEF) before loading into the DW layer as Dim & Fact tables.

    Returns:
        list: SchemaField objects.
    """
    # Key/identity columns are REQUIRED; measure columns are NULLABLE.
    required = [
        ("source", "STRING"),
        ("indicator_original", "STRING"),
        ("indicator_standardized", "STRING"),
        ("country", "STRING"),
    ]
    nullable = [
        ("year", "INTEGER"),
        ("year_range", "STRING"),
        ("value", "FLOAT"),
        ("unit", "STRING"),
    ]
    fields = [bigquery.SchemaField(name, ftype, mode="REQUIRED") for name, ftype in required]
    fields += [bigquery.SchemaField(name, ftype, mode="NULLABLE") for name, ftype in nullable]
    return fields
|
|
|
|
|
|
def get_etl_metadata_schema() -> list:
    """
    Schema for the etl_metadata table (STAGING/Silver layer).

    ETL metadata is an operational table for reproducibility & tracking,
    not part of the Star Schema DW.

    Returns:
        list: SchemaField objects.
    """
    # (name, type, mode) — order matters and matches the table definition.
    spec = [
        ("id", "STRING", "REQUIRED"),
        ("source_class", "STRING", "REQUIRED"),
        ("table_name", "STRING", "REQUIRED"),
        ("execution_timestamp", "DATETIME", "REQUIRED"),
        ("duration_seconds", "FLOAT", "NULLABLE"),
        ("rows_fetched", "INTEGER", "NULLABLE"),
        ("rows_transformed", "INTEGER", "NULLABLE"),
        ("rows_loaded", "INTEGER", "NULLABLE"),
        ("completeness_pct", "FLOAT", "NULLABLE"),
        ("config_snapshot", "STRING", "NULLABLE"),
        ("validation_metrics", "STRING", "NULLABLE"),
        ("created_at", "TIMESTAMP", "REQUIRED"),
        ("updated_at", "TIMESTAMP", "REQUIRED"),
    ]
    return [bigquery.SchemaField(name, ftype, mode=mode) for name, ftype, mode in spec]
|
|
|
|
# ETL METADATA — STAGING LAYER (Silver)
|
|
# FIXED: Preserve created_at dari eksekusi pertama
|
|
|
|
def save_etl_metadata(client: bigquery.Client, metadata: dict):
    """
    Save ETL metadata to the etl_metadata table (AUDIT layer).

    created_at vs updated_at semantics:
        created_at : taken from the FIRST record for the same table_name
                     (preserved across runs — for reproducibility)
        updated_at : always set to the current execution time

    NOTE(review): an earlier docstring said STAGING (Silver), but the code
    targets layer='audit' — documented as AUDIT here.

    Args:
        client   : BigQuery client
        metadata : Dict with ETL execution info:
                   table_name (required)
                   source_class (required)
                   execution_timestamp
                   duration_seconds
                   rows_fetched
                   rows_transformed
                   rows_loaded
                   completeness_pct
                   config_snapshot (JSON string)
                   validation_metrics (JSON string)
    """
    import hashlib  # hoisted from mid-function: imports belong at the top of the scope

    table_name = metadata.get('table_name', 'unknown')
    table_id = get_table_id('etl_metadata', layer='audit')

    # Create the table on first use.
    if not table_exists(client, 'etl_metadata', layer='audit'):
        schema = get_etl_metadata_schema()
        table = bigquery.Table(table_id, schema=schema)
        client.create_table(table)
        print(f" [AUDIT] Created table: etl_metadata")

    # Fetch the earliest created_at for this table (preserved across runs).
    check_query = f"""
        SELECT MIN(created_at) AS first_created_at
        FROM `{table_id}`
        WHERE table_name = @table_name
    """
    job_config_q = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("table_name", "STRING", table_name)
        ]
    )

    try:
        rows = list(client.query(check_query, job_config=job_config_q).result())
        is_first_run = True
        if rows and rows[0]['first_created_at'] is not None:
            created_at = rows[0]['first_created_at']
            is_first_run = False
        else:
            created_at = datetime.now()
    except Exception:
        # Query failed (e.g. brand-new table) — treat as the first run.
        created_at = datetime.now()
        is_first_run = True

    current_time = datetime.now()

    # Unique record ID derived from (source, table, timestamp).
    record_id = hashlib.md5(
        f"{metadata.get('source_class')}_{table_name}_{current_time.isoformat()}".encode()
    ).hexdigest()

    meta_df = pd.DataFrame([{
        'id'                 : record_id,
        'source_class'       : metadata.get('source_class', 'unknown'),
        'table_name'         : table_name,
        'execution_timestamp': metadata.get('execution_timestamp', current_time),
        'duration_seconds'   : float(metadata.get('duration_seconds', 0)),
        'rows_fetched'       : int(metadata.get('rows_fetched', 0)),
        'rows_transformed'   : int(metadata.get('rows_transformed', 0)),
        'rows_loaded'        : int(metadata.get('rows_loaded', 0)),
        'completeness_pct'   : float(metadata.get('completeness_pct', 0)),
        'config_snapshot'    : metadata.get('config_snapshot', '{}'),
        'validation_metrics' : metadata.get('validation_metrics', '{}'),
        'created_at'         : created_at,    # PRESERVED from the first run
        'updated_at'         : current_time   # ALWAYS the current time
    }])

    # BigQuery: strip timezone info before loading.
    for col in ['execution_timestamp', 'created_at', 'updated_at']:
        meta_df[col] = pd.to_datetime(meta_df[col]).dt.tz_localize(None)

    # APPEND to the AUDIT layer.
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")
    job = client.load_table_from_dataframe(meta_df, table_id, job_config=job_config)
    job.result()

    if is_first_run:
        print(f"etl_metadata — first run | created_at : {created_at}")
    else:
        print(f"etl_metadata — preserved | created_at : {created_at}")
    print(f"etl_metadata — updated_at : {current_time}")
|
|
|
|
# INITIALIZE
# Module-level side effects: logging is configured and a BigQuery client is
# opened as soon as this helpers module is imported.
logger = setup_logging()
client = get_bigquery_client()

# Echo the active Kimball layer → dataset mapping for a quick sanity check.
print("BigQuery Helpers Loaded — Kimball DW Architecture")
print(f"Project : {CONFIG['bigquery']['project_id']}")
print(f"Raw (Bronze) : {CONFIG['bigquery']['dataset_bronze']}")
print(f"Staging (Silver) : {CONFIG['bigquery']['dataset_silver']}")
print(f"DW (Gold) : {CONFIG['bigquery']['dataset_gold']}")