raw and staging data

This commit is contained in:
Debby
2026-03-12 14:57:30 +07:00
parent 847a6a9859
commit 0235dfbc75
5 changed files with 30 additions and 219 deletions

View File

@@ -21,6 +21,7 @@ Kimball ETL Flow:
"""
import os
import json
from pathlib import Path
from google.cloud import bigquery
from google.oauth2 import service_account
@@ -88,25 +89,6 @@ KIMBALL_LAYER_MAP = {
"dw" : "gold",
}
# SETUP BIGQUERY CLIENT
def get_bigquery_client() -> bigquery.Client:
    """Build an authenticated BigQuery client from the service-account key file.

    Reads the JSON key at CREDENTIALS_PATH with the cloud-platform scope and
    binds the client to PROJECT_ID in LOCATION.

    Returns:
        bigquery.Client: Authenticated BigQuery client.
    """
    scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    creds = service_account.Credentials.from_service_account_file(
        CREDENTIALS_PATH,
        scopes=scopes,
    )
    client = bigquery.Client(
        credentials=creds,
        project=PROJECT_ID,
        location=LOCATION,
    )
    return client
# MATCHING CONFIGURATION
CONFIG = {
@@ -166,7 +148,6 @@ for directory in [EXPORTS_DIR, LOGS_DIR]:
# HELPER FUNCTIONS
def get_table_id(table_name: str, layer: str = "bronze") -> str:
# Resolve Kimball alias ke layer name
resolved = KIMBALL_LAYER_MAP.get(layer.lower(), layer.lower())
dataset = LAYER_DATASET_MAP.get(resolved, DATASET_BRONZE)
@@ -174,17 +155,6 @@ def get_table_id(table_name: str, layer: str = "bronze") -> str:
def table_exists(client: bigquery.Client, table_name: str, layer: str = "bronze") -> bool:
"""
Check apakah table ada di BigQuery
Args:
client : BigQuery client
table_name : Nama table
layer : Layer — 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
Returns:
bool: True jika table ada
"""
try:
client.get_table(get_table_id(table_name, layer))
return True
@@ -193,14 +163,6 @@ def table_exists(client: bigquery.Client, table_name: str, layer: str = "bronze"
def delete_table(client: bigquery.Client, table_name: str, layer: str = "bronze"):
"""
Delete table jika ada
Args:
client : BigQuery client
table_name : Nama table
layer : Layer — 'bronze'/'raw', 'silver'/'staging', 'gold'/'dw'
"""
table_id = get_table_id(table_name, layer)
try:
client.delete_table(table_id, not_found_ok=True)
@@ -210,13 +172,6 @@ def delete_table(client: bigquery.Client, table_name: str, layer: str = "bronze"
def create_dataset_if_not_exists(client: bigquery.Client, dataset_id: str):
"""
Create dataset jika belum ada
Args:
client : BigQuery client
dataset_id : Dataset ID string
"""
full_id = f"{PROJECT_ID}.{dataset_id}"
try:
client.get_dataset(full_id)
@@ -229,7 +184,6 @@ def create_dataset_if_not_exists(client: bigquery.Client, dataset_id: str):
def create_all_datasets(client: bigquery.Client):
"""Create semua 3 dataset (Raw/Staging/DW) jika belum ada"""
print("Setting up BigQuery Datasets (Kimball DW)...")
for layer, dataset_id in LAYER_DATASET_MAP.items():
create_dataset_if_not_exists(client, dataset_id)
@@ -238,21 +192,14 @@ def create_all_datasets(client: bigquery.Client):
# VERIFICATION
def verify_setup() -> bool:
"""
Verify BigQuery setup untuk semua 3 layer (Raw / Staging / DW)
Checks:
1. Credentials file exists
2. Koneksi ke BigQuery berhasil
3. Semua dataset ada atau berhasil dibuat
"""
print("=" * 60)
print("BIGQUERY SETUP VERIFICATION")
print("Kimball DW Architecture")
print("=" * 60)
# 1. Credentials
if not os.path.exists(CREDENTIALS_PATH):
credentials_json = os.environ.get("GOOGLE_CREDENTIALS_JSON")
if not credentials_json and not os.path.exists(CREDENTIALS_PATH):
print(f"Credentials not found : {CREDENTIALS_PATH}")
return False
print(f"✓ Credentials found")
@@ -284,15 +231,16 @@ def verify_setup() -> bool:
print("=" * 60)
return True
# INITIALIZE ON IMPORT
if __name__ == "__main__":
verify_setup()
else:
print("BigQuery Config Loaded — Kimball DW Architecture")
print(f" Project : {PROJECT_ID}")
print(f" Project : {PROJECT_ID}")
print(f" Raw (Bronze) : {DATASET_BRONZE}")
print(f" Staging (Silver) : {DATASET_SILVER}")
print(f" DW (Gold) : {DATASET_GOLD}")
print(f" Audit : {DATASET_AUDIT}")
print(f" Location : {LOCATION}")
print(f" Location : {LOCATION}")