code last done

This commit is contained in:
Debby
2026-04-02 19:58:05 +07:00
parent 6030268924
commit 47ea9c0492
4 changed files with 708 additions and 1374 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,44 +1,14 @@
"""
BIGQUERY ANALYTICAL LAYER - DATA FILTERING
fact_asean_food_security_selected disimpan di fs_asean_gold (layer='gold')
FIXED: analytical_food_security disimpan di fs_asean_gold (layer='gold')
Filtering Order:
1. Load data (single years only)
2. Determine year boundaries (2013 - auto-detected end year, baseline=2023 per syarat dosen)
2. Determine year boundaries (2013 - auto-detected end year)
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
4. Filter countries with ALL pillars (FIXED SET)
5. Filter indicators with consistent presence across FIXED countries
→ TIDAK menghapus baris year < max_start_year
→ Semua baris tetap ada; label framework ditentukan di Step 6
6. Assign framework (MDGs/SDGs) per indicator PER ROW
→ Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' selalu
→ Indikator DI SDG_ONLY_KEYWORDS + year >= SDG_TRANSITION_YEAR → 'SDGs'
→ Indikator DI SDG_ONLY_KEYWORDS + year < SDG_TRANSITION_YEAR → 'MDGs'
→ SDG_TRANSITION_YEAR = 2015 (HARDCODE — tanggal resmi SDGs berlaku)
7. Verify no gaps (dari actual_start_year per indikator, bukan start_year global)
8. Calculate norm_value_1_100 per indicator (min-max, direction-aware, global)
*** PERBAIKAN: normalisasi dilakukan SEKALI untuk seluruh data (semua tahun),
bukan per-framework, agar nilai dari era MDGs dan SDGs berada di
skala yang sama dan dapat dibandingkan secara adil. ***
9. Calculate YoY per indicator per country
10. Analyze indicator availability by year
11. Save analytical table
FRAMEWORK LOGIC:
- SDG_TRANSITION_YEAR = 2015 (HARDCODE, bukan auto-detect dari data)
- Semua SDG-only indicators menggunakan SDG_TRANSITION_YEAR yang SAMA
- SDG-only + year < SDG_TRANSITION_YEAR → 'MDGs' (data tetap ada, tidak dihapus)
- SDG-only + year >= SDG_TRANSITION_YEAR → 'SDGs'
- Non-SDG-only indicators → 'MDGs' selalu (di semua tahun)
NORMALISASI (PERBAIKAN):
- norm_value_1_100 dihitung SATU KALI per indikator menggunakan seluruh data
(semua tahun, semua negara) sebagai referensi min-max.
- Ini memastikan nilai 60 di era MDGs dan nilai 60 di era SDGs memiliki
makna yang SAMA (posisi relatif yang sama dalam distribusi global).
- Tidak ada rescaling ulang per-framework di layer analitik ini.
- Rescaling per-framework (jika diperlukan untuk visualisasi) sebaiknya
dilakukan di layer agregasi (analysis_layer) dengan flag eksplisit.
6. Save analytical table (value only, normalisasi & direction handled downstream)
"""
import pandas as pd
@@ -64,82 +34,21 @@ from scripts.bigquery_helpers import (
from google.cloud import bigquery
# =============================================================================
# SDG-ONLY INDICATOR KEYWORDS
# =============================================================================
SDG_ONLY_KEYWORDS = frozenset([
# TARGET 2.1.1 — Undernourishment
"prevalence of undernourishment (percent) (3-year average)",
"number of people undernourished (million) (3-year average)",
# TARGET 2.1.2 — Food Insecurity (FIES)
"prevalence of severe food insecurity in the total population (percent) (3-year average)",
"prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
"prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
"prevalence of moderate or severe food insecurity in the total population (percent) (3-year average)",
"prevalence of moderate or severe food insecurity in the male adult population (percent) (3-year average)",
"prevalence of moderate or severe food insecurity in the female adult population (percent) (3-year average)",
"number of severely food insecure people (million) (3-year average)",
"number of severely food insecure male adults (million) (3-year average)",
"number of severely food insecure female adults (million) (3-year average)",
"number of moderately or severely food insecure people (million) (3-year average)",
"number of moderately or severely food insecure male adults (million) (3-year average)",
"number of moderately or severely food insecure female adults (million) (3-year average)",
# TARGET 2.2.1 — Stunting
"percentage of children under 5 years of age who are stunted (modelled estimates) (percent)",
"number of children under 5 years of age who are stunted (modeled estimates) (million)",
# TARGET 2.2.2 — Wasting
"percentage of children under 5 years affected by wasting (percent)",
"number of children under 5 years affected by wasting (million)",
# TARGET 2.2.2 — Overweight (children)
"percentage of children under 5 years of age who are overweight (modelled estimates) (percent)",
"number of children under 5 years of age who are overweight (modeled estimates) (million)",
# TARGET 2.2.3 — Anaemia
"prevalence of anemia among women of reproductive age (15-49 years) (percent)",
"number of women of reproductive age (15-49 years) affected by anemia (million)",
])
# =============================================================================
# SDG TRANSITION YEAR — HARDCODE
# =============================================================================
SDG_TRANSITION_YEAR = 2015
# =============================================================================
# THRESHOLD KONDISI (fixed absolute, skala 1-100)
# =============================================================================
THRESHOLD_BAD = 40.0
THRESHOLD_GOOD = 60.0
def assign_condition(norm_value_1_100: float) -> str:
if pd.isna(norm_value_1_100):
return None
if norm_value_1_100 > THRESHOLD_GOOD:
return 'good'
if norm_value_1_100 < THRESHOLD_BAD:
return 'bad'
return 'moderate'
# =============================================================================
# ANALYTICAL LAYER CLASS
# =============================================================================
class AnalyticalLayerLoader:
"""
Analytical Layer Loader for BigQuery
Analytical Layer Loader for BigQuery - CORRECTED VERSION v4
PERBAIKAN NORMALISASI:
- norm_value_1_100 dihitung SEKALI per indikator dari seluruh data
(semua tahun, semua negara). Tidak ada rescaling ulang per-framework.
- Ini memastikan komparabilitas lintas era MDGs dan SDGs.
Key Logic:
1. Complete per country (no gaps from start_year to end_year)
2. Filter countries with all pillars
3. Ensure indicators have consistent country count across all years
4. Save raw value only (normalisasi & direction handled downstream)
Output: analytical_food_security -> DW layer (Gold) -> fs_asean_gold
"""
def __init__(self, client: bigquery.Client):
@@ -153,14 +62,11 @@ class AnalyticalLayerLoader:
self.df_pillar = None
self.selected_country_ids = None
self.indicator_max_start_map = {}
self.start_year = 2013
self.end_year = None
self.baseline_year = 2023
self.sdg_transition_year = SDG_TRANSITION_YEAR
self.pipeline_metadata = {
'source_class' : self.__class__.__name__,
'start_time' : None,
@@ -175,10 +81,6 @@ class AnalyticalLayerLoader:
self.pipeline_start = None
self.pipeline_end = None
# ------------------------------------------------------------------
# STEP 1: LOAD SOURCE DATA
# ------------------------------------------------------------------
def load_source_data(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 1: LOADING SOURCE DATA from fs_asean_gold")
@@ -209,17 +111,14 @@ class AnalyticalLayerLoader:
"""
self.logger.info("Loading fact table with dimensions...")
self.df_clean = self.client.query(query).result().to_dataframe(
create_bqstorage_client=False
)
self.df_clean = self.client.query(query).result().to_dataframe(create_bqstorage_client=False)
self.logger.info(f" Loaded: {len(self.df_clean):,} rows")
if 'is_year_range' in self.df_clean.columns:
yr = self.df_clean['is_year_range'].value_counts()
self.logger.info(
f" Single years: {yr.get(False, 0):,} | "
f"Year ranges: {yr.get(True, 0):,}"
)
self.logger.info(f" Breakdown:")
self.logger.info(f" Single years (is_year_range=False): {yr.get(False, 0):,}")
self.logger.info(f" Year ranges (is_year_range=True): {yr.get(True, 0):,}")
self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
self.df_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
@@ -236,25 +135,20 @@ class AnalyticalLayerLoader:
self.logger.error(f"Error loading source data: {e}")
raise
# ------------------------------------------------------------------
# STEP 2: DETERMINE YEAR BOUNDARIES
# ------------------------------------------------------------------
def determine_year_boundaries(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES")
self.logger.info("=" * 80)
df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_baseline['indicator_id'].nunique()
df_2023 = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_2023['indicator_id'].nunique()
self.logger.info(f"\n Baseline year (hardcode, syarat dosen): {self.baseline_year}")
self.logger.info(f" Baseline indicator count: {baseline_indicator_count}")
self.logger.info(f"\nBaseline Year: {self.baseline_year}")
self.logger.info(f"Baseline Indicator Count: {baseline_indicator_count}")
years_sorted = sorted(self.df_clean['year'].unique(), reverse=True)
selected_end_year = None
self.logger.info(f"\n Scanning end_year (>= {self.baseline_year}):")
for year in years_sorted:
if year >= self.baseline_year:
df_year = self.df_clean[self.df_clean['year'] == year]
@@ -266,9 +160,9 @@ class AnalyticalLayerLoader:
if selected_end_year is None:
selected_end_year = self.baseline_year
self.logger.warning(f" [!] Fallback to baseline: {selected_end_year}")
self.logger.warning(f" [!] No year found, using baseline: {selected_end_year}")
else:
self.logger.info(f"\n [OK] Selected end year: {selected_end_year}")
self.logger.info(f"\n [OK] Selected End Year: {selected_end_year}")
self.end_year = selected_end_year
original_count = len(self.df_clean)
@@ -278,15 +172,11 @@ class AnalyticalLayerLoader:
(self.df_clean['year'] <= self.end_year)
].copy()
self.logger.info(f"\n Filtering {self.start_year}-{self.end_year}:")
self.logger.info(f"\nFiltering {self.start_year}-{self.end_year}:")
self.logger.info(f" Rows before: {original_count:,}")
self.logger.info(f" Rows after : {len(self.df_clean):,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}")
return self.df_clean
# ------------------------------------------------------------------
# STEP 3: FILTER COMPLETE INDICATORS PER COUNTRY
# ------------------------------------------------------------------
def filter_complete_indicators_per_country(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 3: FILTER COMPLETE INDICATORS PER COUNTRY (NO GAPS)")
@@ -339,14 +229,9 @@ class AnalyticalLayerLoader:
self.logger.info(f" [-] Removed: {len(removed_combinations):,}")
df_valid = pd.DataFrame(valid_combinations)
df_valid['key'] = (
df_valid['country_id'].astype(str) + '_' +
df_valid['indicator_id'].astype(str)
)
self.df_clean['key'] = (
self.df_clean['country_id'].astype(str) + '_' +
self.df_clean['indicator_id'].astype(str)
)
df_valid['key'] = df_valid['country_id'].astype(str) + '_' + df_valid['indicator_id'].astype(str)
self.df_clean['key'] = (self.df_clean['country_id'].astype(str) + '_' +
self.df_clean['indicator_id'].astype(str))
original_count = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['key'].isin(df_valid['key'])].copy()
@@ -358,10 +243,6 @@ class AnalyticalLayerLoader:
self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}")
return self.df_clean
# ------------------------------------------------------------------
# STEP 4: SELECT COUNTRIES WITH ALL PILLARS
# ------------------------------------------------------------------
def select_countries_with_all_pillars(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 4: SELECT COUNTRIES WITH ALL PILLARS (FIXED SET)")
@@ -384,26 +265,18 @@ class AnalyticalLayerLoader:
f"{row['pillar_count']}/{total_pillars} pillars"
)
selected_countries = country_pillar_count[
country_pillar_count['pillar_count'] == total_pillars
]
selected_countries = country_pillar_count[country_pillar_count['pillar_count'] == total_pillars]
self.selected_country_ids = selected_countries['country_id'].tolist()
self.logger.info(f"\n FIXED SET: {len(self.selected_country_ids)} countries")
original_count = len(self.df_clean)
self.df_clean = self.df_clean[
self.df_clean['country_id'].isin(self.selected_country_ids)
].copy()
self.df_clean = self.df_clean[self.df_clean['country_id'].isin(self.selected_country_ids)].copy()
self.logger.info(f" Rows before: {original_count:,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}")
return self.df_clean
# ------------------------------------------------------------------
# STEP 5: FILTER INDICATORS CONSISTENT ACROSS FIXED COUNTRIES
# ------------------------------------------------------------------
def filter_indicators_consistent_across_fixed_countries(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 5: FILTER INDICATORS WITH CONSISTENT PRESENCE")
@@ -412,9 +285,7 @@ class AnalyticalLayerLoader:
indicator_country_start = self.df_clean.groupby([
'indicator_id', 'indicator_name', 'country_id'
])['year'].min().reset_index()
indicator_country_start.columns = [
'indicator_id', 'indicator_name', 'country_id', 'start_year'
]
indicator_country_start.columns = ['indicator_id', 'indicator_name', 'country_id', 'start_year']
indicator_max_start = indicator_country_start.groupby([
'indicator_id', 'indicator_name'
@@ -463,379 +334,47 @@ class AnalyticalLayerLoader:
raise ValueError("No valid indicators found after filtering!")
original_count = len(self.df_clean)
self.df_clean = self.df_clean[
self.df_clean['indicator_id'].isin(valid_indicators)
].copy()
self.df_clean = self.df_clean[self.df_clean['indicator_id'].isin(valid_indicators)].copy()
self.indicator_max_start_map = (
indicator_max_start[indicator_max_start['indicator_id'].isin(valid_indicators)]
.set_index('indicator_id')['max_start_year']
.to_dict()
self.df_clean = self.df_clean.merge(
indicator_max_start[['indicator_id', 'max_start_year']], on='indicator_id', how='left'
)
self.df_clean = self.df_clean[self.df_clean['year'] >= self.df_clean['max_start_year']].copy()
self.df_clean = self.df_clean.drop('max_start_year', axis=1)
self.logger.info(f"\n Rows before : {original_count:,}")
self.logger.info(f" Rows after : {len(self.df_clean):,}")
self.logger.info(f" Countries : {self.df_clean['country_id'].nunique()}")
self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Pillars : {self.df_clean['pillar_id'].nunique()}")
self.logger.info(
f"\n [NOTE] Baris year < max_start_year TETAP ADA di data. "
f"Label framework akan ditentukan di Step 6."
)
self.logger.info(f"\n Rows before: {original_count:,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}")
self.logger.info(f" Countries: {self.df_clean['country_id'].nunique()}")
self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Pillars: {self.df_clean['pillar_id'].nunique()}")
return self.df_clean
# ------------------------------------------------------------------
# STEP 6: ASSIGN FRAMEWORK PER ROW
# ------------------------------------------------------------------
def assign_framework(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 6: ASSIGN FRAMEWORK PER ROW")
self.logger.info("=" * 80)
self.logger.info(f"\n SDG_TRANSITION_YEAR : {self.sdg_transition_year} (HARDCODE)")
self.logger.info(f" Alasan : SDGs resmi berlaku 1 Januari 2015")
self.logger.info(f" Bukan auto-detect : data FIES/anaemia ada sejak 2013,")
self.logger.info(f" tapi tahun 2013-2014 harus tetap MDGs")
indicator_info = (
self.df_clean[['indicator_id', 'indicator_name']]
.drop_duplicates()
.copy()
)
indicator_info['is_sdg_only'] = (
indicator_info['indicator_name']
.str.lower()
.str.strip()
.isin(SDG_ONLY_KEYWORDS)
)
sdg_only_ids = set(
indicator_info.loc[indicator_info['is_sdg_only'], 'indicator_id']
)
non_sdg_ids = set(
indicator_info.loc[~indicator_info['is_sdg_only'], 'indicator_id']
)
self.logger.info(f"\n SDG-only indicators ({len(sdg_only_ids)}):")
for _, row in indicator_info[indicator_info['is_sdg_only']].iterrows():
actual_start = self.indicator_max_start_map.get(row['indicator_id'], '?')
self.logger.info(
f" [SDG-only] id={int(row['indicator_id'])} "
f"actual_start={actual_start} | {row['indicator_name']}"
)
self.logger.info(f"\n Non-SDG-only indicators ({len(non_sdg_ids)}): → MDGs selalu")
if not sdg_only_ids:
raise ValueError(
"Tidak ada indikator SDG-only (FIES/anaemia) yang lolos filter. "
"Pastikan nama indikator di SDG_ONLY_KEYWORDS cocok dengan data BigQuery."
)
self.df_clean['_is_sdg_only'] = self.df_clean['indicator_id'].isin(sdg_only_ids)
self.df_clean['framework'] = np.where(
self.df_clean['_is_sdg_only'] &
(self.df_clean['year'] >= self.sdg_transition_year),
'SDGs',
'MDGs'
)
self.df_clean = self.df_clean.drop(columns=['_is_sdg_only'])
self.logger.info(f"\n Logika assign framework (PER BARIS):")
self.logger.info(f" {''*72}")
self.logger.info(f" Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' di semua tahun")
self.logger.info(f" Indikator DI SDG_ONLY_KEYWORDS:")
self.logger.info(f" year < {self.sdg_transition_year}'MDGs' (data tetap ada, tidak dihapus)")
self.logger.info(f" year >= {self.sdg_transition_year}'SDGs'")
self.logger.info(f" {''*72}")
self.logger.info(f"\n Verifikasi framework per indikator:")
self.logger.info(f" {''*115}")
self.logger.info(
f" {'ID':<5} {'Indicator Name':<52} {'Data From':<11} "
f"{'MDGs rows':<11} {'SDGs rows':<11} {'Note'}"
)
self.logger.info(f" {''*115}")
for ind_id, grp in self.df_clean.groupby('indicator_id'):
ind_name = grp['indicator_name'].iloc[0]
mdgs_rows = (grp['framework'] == 'MDGs').sum()
sdgs_rows = (grp['framework'] == 'SDGs').sum()
is_sdg_only = ind_id in sdg_only_ids
data_from = int(grp['year'].min())
if is_sdg_only:
mdgs_yrs = sorted(grp[grp['framework'] == 'MDGs']['year'].unique())
sdgs_yrs = sorted(grp[grp['framework'] == 'SDGs']['year'].unique())
yr_range_mdgs = f"{min(mdgs_yrs)}-{max(mdgs_yrs)}" if mdgs_yrs else "-"
yr_range_sdgs = f"{min(sdgs_yrs)}-{max(sdgs_yrs)}" if sdgs_yrs else "-"
note = f"MDGs:{yr_range_mdgs} | SDGs:{yr_range_sdgs}"
else:
note = "MDGs always"
self.logger.info(
f" {int(ind_id):<5} {ind_name[:50]:<52} {data_from:<11} "
f"{mdgs_rows:<11} {sdgs_rows:<11} {note}"
)
fw_summary = self.df_clean['framework'].value_counts()
self.logger.info(f"\n Ringkasan rows: " + " | ".join(
f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items()
))
end_year_df = self.df_clean[self.df_clean['year'] == self.end_year]
fw_ind_summary = end_year_df.groupby('framework')['indicator_id'].nunique()
self.logger.info(f" Indicators di year={self.end_year}: " + " | ".join(
f"{fw}: {cnt}" for fw, cnt in fw_ind_summary.items()
))
self.logger.info(
f"\n [OK] 'framework' ditambahkan — "
f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | "
f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows"
)
return self.df_clean
# ------------------------------------------------------------------
# STEP 7: VERIFY NO GAPS
# ------------------------------------------------------------------
def verify_no_gaps(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 7: VERIFY NO GAPS")
self.logger.info("STEP 6: VERIFY NO GAPS")
self.logger.info("=" * 80)
expected_countries = len(self.selected_country_ids)
all_good = True
bad_rows = []
for ind_id, grp in self.df_clean.groupby('indicator_id'):
actual_start = self.indicator_max_start_map.get(ind_id)
if actual_start is None:
continue
expected_years = list(range(int(actual_start), self.end_year + 1))
for year in expected_years:
country_count = grp[grp['year'] == year]['country_id'].nunique()
if country_count != expected_countries:
all_good = False
bad_rows.append({
'indicator_id' : int(ind_id),
'year' : int(year),
'country_count': int(country_count),
})
verification = self.df_clean.groupby(['indicator_id', 'year'])['country_id'].nunique().reset_index()
verification.columns = ['indicator_id', 'year', 'country_count']
all_good = (verification['country_count'] == expected_countries).all()
if all_good:
self.logger.info(
f" VERIFICATION PASSED — all combinations from actual_start_year "
f"have {expected_countries} countries"
)
self.logger.info(f" VERIFICATION PASSED — all combinations have {expected_countries} countries")
else:
for row in bad_rows[:10]:
bad = verification[verification['country_count'] != expected_countries]
for _, row in bad.head(10).iterrows():
self.logger.error(
f" Indicator {row['indicator_id']}, Year {row['year']}: "
f"{row['country_count']} countries (expected {expected_countries})"
f" Indicator {int(row['indicator_id'])}, Year {int(row['year'])}: "
f"{int(row['country_count'])} countries (expected {expected_countries})"
)
raise ValueError("Gap verification failed!")
return True
# ------------------------------------------------------------------
# STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR
# ------------------------------------------------------------------
# PERBAIKAN:
# Normalisasi dilakukan SEKALI per indikator dari SELURUH DATA
# (semua tahun 2013end_year, semua negara, tanpa memisahkan framework).
#
# Alasan:
# - Sebelumnya, rescaling per-framework di analysis_layer menyebabkan
# nilai 1-100 era MDGs dan SDGs memiliki referensi yang berbeda,
# sehingga tidak dapat dibandingkan secara adil.
# - Dengan satu normalisasi global per indikator, nilai 60 di era MDGs
# dan nilai 60 di era SDGs berarti hal yang sama: posisi relatif yang
# sama dalam distribusi historis indikator tersebut.
# - Jika SDGs memang era yang lebih buruk secara substantif, itu akan
# tercermin sebagai nilai norm yang memang lebih rendah — bukan artefak
# dari rescaling ulang.
# ------------------------------------------------------------------
def calculate_norm_value(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR (GLOBAL, SEKALI)")
self.logger.info("=" * 80)
self.logger.info(
"\n [PERBAIKAN] Normalisasi dilakukan SEKALI per indikator dari seluruh data."
"\n Tidak ada rescaling ulang per-framework."
"\n Ini memastikan komparabilitas lintas era MDGs dan SDGs."
)
DIRECTION_INVERT = frozenset({
"negative", "lower_better", "lower_is_better", "inverse", "neg",
})
df = self.df_clean.copy()
norm_parts = []
indicators = df.groupby(['indicator_id', 'indicator_name', 'direction'])
self.logger.info(
f"\n {'ID':<5} {'Direction':<15} {'Invert':<8} "
f"{'Min':>10} {'Max':>10} {'Indicator Name'}"
)
self.logger.info(f" {'-'*90}")
for (ind_id, ind_name, direction), grp in indicators:
grp = grp.copy()
do_invert = str(direction).lower().strip() in DIRECTION_INVERT
valid_mask = grp['value'].notna()
n_valid = valid_mask.sum()
if n_valid < 2:
grp['norm_value_1_100'] = np.nan
norm_parts.append(grp)
self.logger.warning(
f" {int(ind_id):<5} {direction:<15} {'N/A':<8} "
f"{'N/A':>10} {'N/A':>10} {ind_name[:45]} [SKIPPED: n_valid={n_valid}]"
)
continue
raw = grp.loc[valid_mask, 'value'].values
v_min = raw.min()
v_max = raw.max()
normed = np.full(len(grp), np.nan)
if v_min == v_max:
# Semua nilai sama → assign tengah skala
normed[valid_mask.values] = 50.5
else:
scaled = (raw - v_min) / (v_max - v_min)
if do_invert:
scaled = 1.0 - scaled
normed[valid_mask.values] = 1.0 + scaled * 99.0
grp['norm_value_1_100'] = normed
self.logger.info(
f" {int(ind_id):<5} {direction:<15} {'YES' if do_invert else 'no':<8} "
f"{v_min:>10.3f} {v_max:>10.3f} {ind_name[:45]}"
)
norm_parts.append(grp)
self.df_clean = pd.concat(norm_parts, ignore_index=True)
valid_norm = self.df_clean['norm_value_1_100'].notna().sum()
null_norm = self.df_clean['norm_value_1_100'].isna().sum()
self.logger.info(f"\n norm_value_1_100 — valid: {valid_norm:,} | null: {null_norm:,}")
self.logger.info(
f" Range aktual: "
f"{self.df_clean['norm_value_1_100'].min():.2f} - "
f"{self.df_clean['norm_value_1_100'].max():.2f}"
)
# ----------------------------------------------------------------
# VALIDASI KOMPARABILITAS: Cek apakah ada gap sistematis antar era
# Ini adalah sinyal diagnostik — bukan error.
# Gap besar (>15 poin) setelah perbaikan = fenomena nyata, bukan artefak.
# ----------------------------------------------------------------
self.logger.info(f"\n [DIAGNOSTIK KOMPARABILITAS] Rata-rata norm per framework per tahun:")
self.logger.info(f" {''*55}")
fw_year_mean = (
self.df_clean
.groupby(['framework', 'year'])['norm_value_1_100']
.mean()
.reset_index()
.sort_values(['framework', 'year'])
)
for fw, grp_fw in fw_year_mean.groupby('framework'):
means = grp_fw['norm_value_1_100'].values
years = grp_fw['year'].values
self.logger.info(f"\n Framework: {fw}")
for yr, m in zip(years, means):
bar = '' * int(m / 5)
self.logger.info(f" {int(yr)} : {m:6.2f} {bar}")
# Bandingkan rata-rata MDGs vs SDGs (hanya tahun di mana keduanya ada)
mdgs_mean_total = self.df_clean[self.df_clean['framework'] == 'MDGs']['norm_value_1_100'].mean()
sdgs_mean_total = self.df_clean[self.df_clean['framework'] == 'SDGs']['norm_value_1_100'].mean()
gap = mdgs_mean_total - sdgs_mean_total
self.logger.info(
f"\n Rata-rata keseluruhan:"
f"\n MDGs : {mdgs_mean_total:.2f}"
f"\n SDGs : {sdgs_mean_total:.2f}"
f"\n Gap : {gap:.2f} poin"
)
if abs(gap) > 15:
self.logger.info(
f"\n [INFO] Gap {gap:.2f} poin antara MDGs dan SDGs."
f"\n Setelah perbaikan normalisasi (satu referensi global),"
f"\n gap ini mencerminkan perbedaan SUBSTANTIF, bukan artefak teknis."
f"\n Indikator SDGs memang mengukur dimensi deprivasi yang lebih dalam"
f"\n (FIES, stunting, wasting, anaemia) dibanding indikator MDGs."
)
else:
self.logger.info(
f"\n [OK] Gap {gap:.2f} poin — dalam batas wajar, tidak ada bias sistematis."
)
# Distribusi kondisi
self.df_clean['_condition_preview'] = (
self.df_clean['norm_value_1_100'].apply(assign_condition)
)
cond_dist = self.df_clean['_condition_preview'].value_counts()
self.logger.info(
f"\n Distribusi kondisi "
f"(threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}):"
)
for cond, cnt in cond_dist.items():
self.logger.info(f" {cond}: {cnt:,} rows")
self.df_clean = self.df_clean.drop(columns=['_condition_preview'])
self.logger.info(f"\n [OK] Kolom 'norm_value_1_100' ditambahkan ke df_clean")
return self.df_clean
# ------------------------------------------------------------------
# STEP 9: CALCULATE YOY
# ------------------------------------------------------------------
def calculate_yoy(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 9: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY")
self.logger.info("=" * 80)
df = self.df_clean.sort_values(['country_id', 'indicator_id', 'year']).copy()
df['value_prev'] = df.groupby(['country_id', 'indicator_id'])['value'].shift(1)
df['yoy_change'] = df['value'] - df['value_prev']
df['yoy_pct'] = np.where(
df['value_prev'].notna() & (df['value_prev'] != 0),
(df['yoy_change'] / df['value_prev'].abs()) * 100,
np.nan
)
df = df.drop(columns=['value_prev'])
total_rows = len(df)
valid_yoy = df['yoy_pct'].notna().sum()
null_yoy = df['yoy_pct'].isna().sum()
self.logger.info(f" Total rows : {total_rows:,}")
self.logger.info(f" YoY calculated : {valid_yoy:,}")
self.logger.info(f" YoY NULL (base yr): {null_yoy:,}")
self.df_clean = df
self.logger.info(f" [OK] Kolom 'yoy_change', 'yoy_pct' ditambahkan")
return self.df_clean
# ------------------------------------------------------------------
# STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR
# ------------------------------------------------------------------
def analyze_indicator_availability_by_year(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR")
self.logger.info("STEP 7: ANALYZE INDICATOR AVAILABILITY BY YEAR")
self.logger.info("=" * 80)
year_stats = self.df_clean.groupby('year').agg({
@@ -861,139 +400,57 @@ class AnalyticalLayerLoader:
'indicator_id', 'indicator_name', 'pillar_name', 'direction',
'start_year', 'end_year', 'country_count'
]
fw_at_end = (
self.df_clean[self.df_clean['year'] == self.end_year]
.groupby('indicator_id')['framework']
.first()
.reset_index()
)
indicator_details = indicator_details.merge(fw_at_end, on='indicator_id', how='left')
indicator_details['framework'] = indicator_details['framework'].fillna('MDGs')
indicator_details['year_range'] = (
indicator_details['start_year'].astype(int).astype(str) + '-' +
indicator_details['end_year'].astype(int).astype(str)
)
indicator_details = indicator_details.sort_values(
['framework', 'pillar_name', 'start_year', 'indicator_name']
)
indicator_details = indicator_details.sort_values(['pillar_name', 'start_year', 'indicator_name'])
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
self.logger.info(f"Framework breakdown (at end_year={self.end_year}):")
for fw, count in indicator_details.groupby('framework').size().items():
self.logger.info(f" {fw}: {count} indicators")
for pillar, count in indicator_details.groupby('pillar_name').size().items():
self.logger.info(f" {pillar}: {count} indicators")
self.logger.info(f"\n{'-'*110}")
self.logger.info(
f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} "
f"{'Framework':<10} {'Years':<12} {'Dir':<8} {'Countries'}"
)
self.logger.info(f"{'-'*110}")
self.logger.info(f"\n{'-'*100}")
self.logger.info(f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} {'Years':<12} {'Dir':<8} {'Countries'}")
self.logger.info(f"{'-'*100}")
for _, row in indicator_details.iterrows():
direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-'
self.logger.info(
f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} "
f"{row['pillar_name'][:13]:<15} {row['framework']:<10} "
f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}"
f"{row['pillar_name'][:13]:<15} {row['year_range']:<12} "
f"{direction:<8} {int(row['country_count'])}"
)
return year_stats
# ------------------------------------------------------------------
# STEP 11: SAVE ANALYTICAL TABLE
# ------------------------------------------------------------------
def save_analytical_table(self):
table_name = 'fact_asean_food_security_selected'
table_name = 'analytical_food_security'
self.logger.info("\n" + "=" * 80)
self.logger.info(f"STEP 11: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold")
self.logger.info(f"STEP 8: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold")
self.logger.info("=" * 80)
try:
if 'framework' not in self.df_clean.columns:
raise ValueError("Kolom 'framework' tidak ada. Pastikan Step 6 sudah dijalankan.")
if 'norm_value_1_100' not in self.df_clean.columns:
raise ValueError("Kolom 'norm_value_1_100' tidak ada. Pastikan Step 8 sudah dijalankan.")
if 'yoy_change' not in self.df_clean.columns:
raise ValueError("Kolom 'yoy_change' tidak ada. Pastikan Step 9 sudah dijalankan.")
analytical_df = self.df_clean[[
'country_id',
'country_name',
'indicator_id',
'indicator_name',
'direction',
'framework',
'pillar_id',
'pillar_name',
'time_id',
'year',
'value',
'norm_value_1_100',
'yoy_change',
'yoy_pct',
'country_id', 'indicator_id', 'pillar_id', 'time_id', 'value'
]].copy()
analytical_df = analytical_df.sort_values(
['year', 'country_name', 'pillar_name', 'indicator_name']
['time_id', 'country_id', 'indicator_id']
).reset_index(drop=True)
analytical_df['country_id'] = analytical_df['country_id'].astype(int)
analytical_df['country_name'] = analytical_df['country_name'].astype(str)
analytical_df['indicator_id'] = analytical_df['indicator_id'].astype(int)
analytical_df['indicator_name'] = analytical_df['indicator_name'].astype(str)
analytical_df['direction'] = analytical_df['direction'].astype(str)
analytical_df['framework'] = analytical_df['framework'].astype(str)
analytical_df['pillar_id'] = analytical_df['pillar_id'].astype(int)
analytical_df['pillar_name'] = analytical_df['pillar_name'].astype(str)
analytical_df['time_id'] = analytical_df['time_id'].astype(int)
analytical_df['year'] = analytical_df['year'].astype(int)
analytical_df['value'] = analytical_df['value'].astype(float)
analytical_df['norm_value_1_100'] = analytical_df['norm_value_1_100'].astype(float)
analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float)
analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float)
self.logger.info(f" Total rows: {len(analytical_df):,}")
fw_dist_rows = analytical_df['framework'].value_counts()
self.logger.info(f" Framework distribution (rows):")
for fw, cnt in fw_dist_rows.items():
self.logger.info(f" {fw}: {cnt:,} rows")
fw_dist_ind = (
analytical_df[analytical_df['year'] == self.end_year]
.drop_duplicates('indicator_id')['framework']
.value_counts()
)
self.logger.info(
f" Framework distribution (indicators at year={self.end_year}):"
)
for fw, cnt in fw_dist_ind.items():
self.logger.info(f" {fw}: {cnt} indicators")
self.logger.info(
f" norm_value_1_100 range: "
f"{analytical_df['norm_value_1_100'].min():.2f} - "
f"{analytical_df['norm_value_1_100'].max():.2f}"
)
schema = [
bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("country_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("indicator_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("indicator_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
bigquery.SchemaField("pillar_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("pillar_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("norm_value_1_100", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"),
]
rows_loaded = load_to_bigquery(
@@ -1016,63 +473,31 @@ class AnalyticalLayerLoader:
'config_snapshot' : json.dumps({
'start_year' : self.start_year,
'end_year' : self.end_year,
'baseline_year' : self.baseline_year,
'sdg_transition_year' : self.sdg_transition_year,
'sdg_transition_source' : 'HARDCODE — SDGs resmi berlaku 1 Jan 2015',
'fixed_countries' : len(self.selected_country_ids),
'norm_scale' : (
'1-100 per indicator global minmax direction-aware. '
'SATU normalisasi untuk seluruh data tanpa rescaling per-framework. '
'Komparabilitas lintas era MDGs/SDGs terjamin.'
),
'framework_logic' : (
f'SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE); '
'SDG-only + year >= SDG_TRANSITION_YEAR → SDGs; '
'SDG-only + year < SDG_TRANSITION_YEAR → MDGs (data tetap ada); '
'non-SDG-only → MDGs selalu'
),
'sdg_only_keywords_count': len(SDG_ONLY_KEYWORDS),
'condition_thresholds' : {
'bad' : f'< {THRESHOLD_BAD}',
'moderate': f'{THRESHOLD_BAD}-{THRESHOLD_GOOD}',
'good' : f'> {THRESHOLD_GOOD}',
},
'fixed_countries': len(self.selected_country_ids),
'no_gaps' : True,
'layer' : 'gold'
}),
'validation_metrics' : json.dumps({
'fixed_countries' : len(self.selected_country_ids),
'total_indicators' : int(self.df_clean['indicator_id'].nunique()),
'sdg_transition_year': self.sdg_transition_year,
'framework_dist_rows': fw_dist_rows.to_dict(),
'total_indicators': int(self.df_clean['indicator_id'].nunique())
})
}
save_etl_metadata(self.client, metadata)
self.logger.info(f" [OK] {table_name}: {rows_loaded:,} rows -> fs_asean_gold")
self.logger.info(f" {table_name}: {rows_loaded:,} rows → [DW/Gold] fs_asean_gold")
self.logger.info(f" Metadata → [AUDIT] etl_metadata")
return rows_loaded
except Exception as e:
self.logger.error(f"Error saving: {e}")
raise
# ------------------------------------------------------------------
# RUN
# ------------------------------------------------------------------
def run(self):
self.pipeline_start = datetime.now()
self.pipeline_metadata['start_time'] = self.pipeline_start
self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info(
f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). "
"SDG-only + year >= 2015 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu."
)
self.logger.info(
"NORMALISASI: SATU referensi global per indikator — tidak ada rescaling per-framework."
)
self.logger.info("Output: analytical_food_security fs_asean_gold")
self.logger.info("=" * 80)
self.load_source_data()
@@ -1080,10 +505,7 @@ class AnalyticalLayerLoader:
self.filter_complete_indicators_per_country()
self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries()
self.assign_framework()
self.verify_no_gaps()
self.calculate_norm_value()
self.calculate_yoy()
self.analyze_indicator_availability_by_year()
self.save_analytical_table()
@@ -1095,10 +517,9 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80)
self.logger.info(f" Duration : {duration:.2f}s")
self.logger.info(f" Year Range : {self.start_year}-{self.end_year}")
self.logger.info(f" SDG Transition Year: {self.sdg_transition_year} (HARDCODE)")
self.logger.info(f" Countries : {len(self.selected_country_ids)}")
self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Rows Loaded : {self.pipeline_metadata['rows_loaded']:,}")
self.logger.info(f" Rows Loaded: {self.pipeline_metadata['rows_loaded']:,}")
# =============================================================================
@@ -1106,6 +527,10 @@ class AnalyticalLayerLoader:
# =============================================================================
def run_analytical_layer():
"""
Airflow task: Build analytical_food_security dari fact_food_security + dims.
Dipanggil setelah dimensional_model_to_gold selesai.
"""
from scripts.bigquery_config import get_bigquery_client
client = get_bigquery_client()
loader = AnalyticalLayerLoader(client)
@@ -1119,14 +544,7 @@ def run_analytical_layer():
if __name__ == "__main__":
print("=" * 80)
print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING")
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print(f"Norm: min-max 1-100 per indicator, direction-aware, GLOBAL (satu referensi)")
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
print(
f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). "
"SDG-only + year >= 2015 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu."
)
print("Output: analytical_food_security → fs_asean_gold")
print("=" * 80)
logger = setup_logging()
@@ -1136,6 +554,4 @@ if __name__ == "__main__":
print("\n" + "=" * 80)
print("[OK] COMPLETED")
print(f" SDG Transition Year : {loader.sdg_transition_year} (HARDCODE)")
print(f" Rows Loaded : {loader.pipeline_metadata['rows_loaded']:,}")
print("=" * 80)

View File

@@ -40,7 +40,7 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
"""Load data dari staging_integrated (STAGING/Silver layer)."""
print("\nLoading data from staging_integrated (fs_asean_silver)...")
df_staging = read_from_bigquery(client, 'staging_integrated', layer='silver')
print(f" Loaded : {len(df_staging):,} rows")
print(f" Loaded : {len(df_staging):,} rows")
print(f" Columns : {len(df_staging.columns)}")
print(f" Sources : {df_staging['source'].nunique()}")
print(f" Indicators : {df_staging['indicator_standardized'].nunique()}")
@@ -53,6 +53,7 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
# COLUMN CONSTRAINT HELPERS
# =============================================================================
# Schema constraints — semua varchar max lengths
COLUMN_CONSTRAINTS = {
'source' : 20,
'indicator_original' : 255,
@@ -61,7 +62,7 @@ COLUMN_CONSTRAINTS = {
'year_range' : 20,
'unit' : 20,
'pillar' : 20,
'direction' : 15,
'direction' : 15, # 'higher_better'=13, 'lower_better'=12
}
@@ -100,11 +101,11 @@ def apply_column_constraints(df: pd.DataFrame) -> pd.DataFrame:
)
if truncation_report:
print("\n Column Truncations Applied:")
print("\n Column Truncations Applied:")
for column, info in truncation_report.items():
print(f" - {column}: {info['count']} values truncated to {info['max_length']} chars")
else:
print("\n No truncations needed — all values within constraints")
print("\n No truncations needed — all values within constraints")
return df_constrained
@@ -176,16 +177,16 @@ def standardize_country_names_asean(df: pd.DataFrame, country_column: str = 'cou
def assign_pillar(indicator_name: str) -> str:
"""
Assign pillar berdasarkan keyword indikator.
Return values: 'Availability', 'Access', 'Utilization', 'Stability', 'Supporting'
All <= 20 chars (varchar(20) constraint).
Return values: 'Availability', 'Access', 'Utilization', 'Stability', 'Other'
All 20 chars (varchar(20) constraint).
"""
if pd.isna(indicator_name):
return 'Supporting'
return 'Other'
ind = str(indicator_name).lower()
for kw in ['requirement', 'coefficient', 'losses', 'fat supply']:
if kw in ind:
return 'Supporting'
return 'Other'
if any(kw in ind for kw in [
'adequacy', 'protein supply', 'supply of protein',
@@ -209,13 +210,12 @@ def assign_pillar(indicator_name: str) -> str:
if any(kw in ind for kw in [
'wasting', 'wasted', 'stunted', 'overweight', 'obese', 'obesity',
'anemia', 'anaemia', 'birthweight', 'breastfeeding', 'drinking water',
'sanitation', 'children under 5', 'newborns with low',
'women of reproductive'
'anemia', 'birthweight', 'breastfeeding', 'drinking water', 'sanitation',
'children under 5', 'newborns with low', 'women of reproductive'
]):
return 'Utilization'
return 'Supporting'
return 'Other'
# =============================================================================
@@ -226,15 +226,17 @@ def assign_direction(indicator_name: str) -> str:
"""
Assign direction berdasarkan indikator.
Return values: 'higher_better' (13 chars) atau 'lower_better' (12 chars)
Both <= 15 chars (varchar(15) constraint).
Both 15 chars (varchar(15) constraint).
"""
if pd.isna(indicator_name):
return 'higher_better'
ind = str(indicator_name).lower()
# Spesifik lower_better
if 'share of dietary energy supply derived from cereals' in ind:
return 'lower_better'
# Higher_better exceptions — cek sebelum lower_better keywords
for kw in [
'exclusive breastfeeding',
'dietary energy supply',
@@ -246,6 +248,7 @@ def assign_direction(indicator_name: str) -> str:
if kw in ind:
return 'higher_better'
# Lower_better — masalah yang harus diminimalkan
for kw in [
'prevalence of undernourishment',
'prevalence of severe food insecurity',
@@ -256,7 +259,6 @@ def assign_direction(indicator_name: str) -> str:
'prevalence of overweight',
'prevalence of obesity',
'prevalence of anemia',
'prevalence of anaemia',
'prevalence of low birthweight',
'number of people undernourished',
'number of severely food insecure',
@@ -281,9 +283,6 @@ def assign_direction(indicator_name: str) -> str:
'coefficient of variation',
'incidence of caloric losses',
'food losses',
'indicator of food price anomalies',
'proportion of local breeds classified as being at risk',
'agricultural export subsidies',
]:
if kw in ind:
return 'lower_better'
@@ -300,18 +299,19 @@ class CleanedDataLoader:
Loader untuk cleaned integrated data ke STAGING layer (Silver).
Kimball context:
Input : staging_integrated -> STAGING (Silver) — fs_asean_silver
Output : cleaned_integrated -> STAGING (Silver) — fs_asean_silver
Audit : etl_logs, etl_metadata -> AUDIT — fs_asean_audit
Input : staging_integrated STAGING (Silver) — fs_asean_silver
Output : cleaned_integrated STAGING (Silver) — fs_asean_silver
Audit : etl_logs, etl_metadata AUDIT — fs_asean_audit
Pipeline steps:
1. Standardize country names (ASEAN)
2. Remove missing values
3. Remove duplicates
4. Add pillar & direction classification
5. Apply column constraints
6. Load ke BigQuery
7. Log ke Audit layer
4. Add pillar classification
5. Add direction classification
6. Apply column constraints
7. Load ke BigQuery
8. Log ke Audit layer
"""
SCHEMA = [
@@ -355,7 +355,7 @@ class CleanedDataLoader:
def _step_standardize_countries(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 1/5] Standardize country names...")
df, report = standardize_country_names_asean(df, country_column='country')
print(f" ASEAN countries mapped : {report['countries_mapped']}")
print(f" ASEAN countries mapped : {report['countries_mapped']}")
unique_countries = sorted(df['country'].unique())
print(f" Countries ({len(unique_countries)}) : {', '.join(unique_countries)}")
log_update(self.client, 'STAGING', 'staging_integrated',
@@ -377,9 +377,7 @@ class CleanedDataLoader:
def _step_remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 3/5] Remove duplicates...")
exact_dups = df.duplicated().sum()
data_dups = df.duplicated(
subset=['indicator_standardized', 'country', 'year', 'value']
).sum()
data_dups = df.duplicated(subset=['indicator_standardized', 'country', 'year', 'value']).sum()
print(f" Exact duplicates : {exact_dups:,}")
print(f" Data duplicates : {data_dups:,}")
rows_before = len(df)
@@ -393,21 +391,19 @@ class CleanedDataLoader:
def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 4/5] Add pillar & direction classification...")
df = df.copy()
df['pillar'] = df['indicator_standardized'].apply(assign_pillar)
df['direction'] = df['indicator_standardized'].apply(assign_direction)
pillar_counts = df['pillar'].value_counts()
print(f" Pillar distribution:")
print(f" Pillar distribution:")
for pillar, count in pillar_counts.items():
print(f" - {pillar}: {count:,}")
direction_counts = df['direction'].value_counts()
print(f" Direction distribution:")
print(f" Direction distribution:")
for direction, count in direction_counts.items():
pct = count / len(df) * 100
print(f" - {direction}: {count:,} ({pct:.1f}%)")
return df
def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -442,6 +438,7 @@ class CleanedDataLoader:
if 'country' in df.columns:
validation['unique_countries'] = int(df['country'].nunique())
# Column length check
column_length_check = {}
for col, max_len in COLUMN_CONSTRAINTS.items():
if col in df.columns:
@@ -460,7 +457,7 @@ class CleanedDataLoader:
def run(self, df: pd.DataFrame) -> int:
"""
Execute full cleaning pipeline -> load ke STAGING (Silver).
Execute full cleaning pipeline load ke STAGING (Silver).
Returns:
int: Rows loaded
@@ -472,6 +469,7 @@ class CleanedDataLoader:
print(" ERROR: DataFrame is empty, nothing to process.")
return 0
# Pipeline steps
df = self._step_standardize_countries(df)
df = self._step_remove_missing(df)
df = self._step_remove_duplicates(df)
@@ -480,6 +478,7 @@ class CleanedDataLoader:
self.metadata['rows_transformed'] = len(df)
# Validate
validation = self.validate_data(df)
self.metadata['validation_metrics'] = validation
@@ -488,12 +487,13 @@ class CleanedDataLoader:
for info in validation.get('column_length_check', {}).values()
)
if not all_within_limits:
print("\n WARNING: Some columns still exceed length constraints!")
print("\n WARNING: Some columns still exceed length constraints!")
for col, info in validation['column_length_check'].items():
if not info['within_limit']:
print(f" - {col}: {info['max_actual_length']} > {info['max_length_constraint']}")
print(f"\n Loading to [STAGING/Silver] {self.table_name} -> fs_asean_silver...")
# Load ke Silver
print(f"\n Loading to [STAGING/Silver] {self.table_name} → fs_asean_silver...")
rows_loaded = load_to_bigquery(
self.client, df, self.table_name,
layer='silver',
@@ -502,8 +502,10 @@ class CleanedDataLoader:
)
self.metadata['rows_loaded'] = rows_loaded
# Audit logs
log_update(self.client, 'STAGING', self.table_name, 'full_refresh', rows_loaded)
# ETL metadata
self.metadata['end_time'] = datetime.now()
self.metadata['duration_seconds'] = (
self.metadata['end_time'] - self.metadata['start_time']
@@ -514,31 +516,33 @@ class CleanedDataLoader:
self.metadata['validation_metrics'] = json.dumps(validation)
save_etl_metadata(self.client, self.metadata)
print(f"\n Cleaned Integration completed: {rows_loaded:,} rows")
# Summary
print(f"\n ✓ Cleaned Integration completed: {rows_loaded:,} rows")
print(f" Duration : {self.metadata['duration_seconds']:.2f}s")
print(f" Completeness : {validation['completeness_pct']:.2f}%")
if 'year_range' in validation:
yr = validation['year_range']
if yr['min'] and yr['max']:
print(f" Year range : {yr['min']}-{yr['max']}")
print(f" Year range : {yr['min']}{yr['max']}")
print(f" Indicators : {validation.get('unique_indicators', '-')}")
print(f" Countries : {validation.get('unique_countries', '-')}")
print(f"\n Schema Validation:")
for col, info in validation.get('column_length_check', {}).items():
status = "OK" if info['within_limit'] else "FAIL"
print(f" [{status}] {col}: {info['max_actual_length']}/{info['max_length_constraint']}")
print(f"\n Metadata -> [AUDIT] etl_metadata")
status = "" if info['within_limit'] else ""
print(f" {status} {col}: {info['max_actual_length']}/{info['max_length_constraint']}")
print(f"\n Metadata [AUDIT] etl_metadata")
return rows_loaded
# =============================================================================
# AIRFLOW TASK FUNCTIONS
# AIRFLOW TASK FUNCTIONS ← sama polanya dengan raw layer
# =============================================================================
def run_cleaned_integration():
"""
Airflow task: Load cleaned_integrated dari staging_integrated.
Dipanggil oleh DAG setelah task staging_integration_to_silver selesai.
"""
from scripts.bigquery_config import get_bigquery_client
@@ -557,21 +561,21 @@ if __name__ == "__main__":
print("=" * 60)
print("BIGQUERY CLEANED LAYER ETL")
print("Kimball DW Architecture")
print(" Input : STAGING (Silver) -> staging_integrated")
print(" Output : STAGING (Silver) -> cleaned_integrated")
print(" Audit : AUDIT -> etl_logs, etl_metadata")
print(" Input : STAGING (Silver) staging_integrated")
print(" Output : STAGING (Silver) cleaned_integrated")
print(" Audit : AUDIT etl_logs, etl_metadata")
print("=" * 60)
logger = setup_logging()
client = get_bigquery_client()
df_staging = load_staging_data(client)
print("\n[1/1] Cleaned Integration -> STAGING (Silver)...")
print("\n[1/1] Cleaned Integration STAGING (Silver)...")
loader = CleanedDataLoader(client, load_mode='full_refresh')
final_count = loader.run(df_staging)
print("\n" + "=" * 60)
print("[OK] CLEANED LAYER ETL COMPLETED")
print(f" STAGING (Silver) : cleaned_integrated ({final_count:,} rows)")
print(f" AUDIT : etl_logs, etl_metadata")
print(" CLEANED LAYER ETL COMPLETED")
print(f" 🥈 STAGING (Silver) : cleaned_integrated ({final_count:,} rows)")
print(f" 📋 AUDIT : etl_logs, etl_metadata")
print("=" * 60)

View File

@@ -46,9 +46,9 @@ class DimensionalModelLoader:
Loader untuk dimensional model ke DW layer (Gold) — fs_asean_gold.
Kimball context:
Input : cleaned_integrated -> STAGING (Silver) — fs_asean_silver
Output : dim_* + fact_* -> DW (Gold) — fs_asean_gold
Audit : etl_logs, etl_metadata -> AUDIT — fs_asean_audit
Input : cleaned_integrated STAGING (Silver) — fs_asean_silver
Output : dim_* + fact_* DW (Gold) — fs_asean_gold
Audit : etl_logs, etl_metadata AUDIT — fs_asean_audit
Pipeline steps:
1. Load dim_country
@@ -117,7 +117,7 @@ class DimensionalModelLoader:
"""
try:
self.client.query(query).result()
self.logger.info(f" [OK] FK: {table_name}.{fk_column} -> {ref_table}.{ref_column}")
self.logger.info(f" [OK] FK: {table_name}.{fk_column} {ref_table}.{ref_column}")
except Exception as e:
if "already exists" in str(e).lower():
self.logger.info(f" [INFO] FK already exists: {constraint_name}")
@@ -145,7 +145,7 @@ class DimensionalModelLoader:
}
try:
save_etl_metadata(self.client, metadata)
self.logger.info(f" Metadata -> [AUDIT] etl_metadata")
self.logger.info(f" Metadata [AUDIT] etl_metadata")
except Exception as e:
self.logger.warning(f" [WARN] Could not save metadata for {table_name}: {e}")
@@ -156,7 +156,7 @@ class DimensionalModelLoader:
def load_dim_time(self):
table_name = 'dim_time'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_time -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading dim_time [DW/Gold] fs_asean_gold...")
try:
if 'year_range' in self.df_clean.columns:
@@ -229,7 +229,7 @@ class DimensionalModelLoader:
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" dim_time: {rows_loaded} rows\n")
self.logger.info(f" dim_time: {rows_loaded} rows\n")
return rows_loaded
except Exception as e:
@@ -240,7 +240,7 @@ class DimensionalModelLoader:
def load_dim_country(self):
table_name = 'dim_country'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_country -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading dim_country [DW/Gold] fs_asean_gold...")
try:
dim_country = self.df_clean[['country']].drop_duplicates().copy()
@@ -270,9 +270,7 @@ class DimensionalModelLoader:
lambda x: region_mapping.get(x, ('Unknown', 'Unknown'))[1])
dim_country['iso_code'] = dim_country['country_name'].map(iso_mapping)
dim_country_final = dim_country[
['country_name', 'region', 'subregion', 'iso_code']
].copy()
dim_country_final = dim_country[['country_name', 'region', 'subregion', 'iso_code']].copy()
dim_country_final = dim_country_final.reset_index(drop=True)
dim_country_final.insert(0, 'country_id', range(1, len(dim_country_final) + 1))
@@ -295,7 +293,7 @@ class DimensionalModelLoader:
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" dim_country: {rows_loaded} rows\n")
self.logger.info(f" dim_country: {rows_loaded} rows\n")
return rows_loaded
except Exception as e:
@@ -304,19 +302,9 @@ class DimensionalModelLoader:
raise
def load_dim_indicator(self):
"""
Load dim_indicator ke Gold layer.
Kolom yang dimuat:
indicator_id — surrogate key
indicator_name — nama standar indikator
indicator_category — kategori (Health & Nutrition, dll.)
unit — satuan ukuran
direction — higher_better / lower_better
"""
table_name = 'dim_indicator'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_indicator -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading dim_indicator [DW/Gold] fs_asean_gold...")
try:
has_direction = 'direction' in self.df_clean.columns
@@ -326,7 +314,6 @@ class DimensionalModelLoader:
dim_indicator = self.df_clean[['indicator_standardized']].drop_duplicates().copy()
dim_indicator.columns = ['indicator_name']
# Unit
if has_unit:
unit_map = self.df_clean[['indicator_standardized', 'unit']].drop_duplicates()
unit_map.columns = ['indicator_name', 'unit']
@@ -334,7 +321,6 @@ class DimensionalModelLoader:
else:
dim_indicator['unit'] = None
# Direction
if has_direction:
dir_map = self.df_clean[['indicator_standardized', 'direction']].drop_duplicates()
dir_map.columns = ['indicator_name', 'direction']
@@ -344,43 +330,30 @@ class DimensionalModelLoader:
dim_indicator['direction'] = 'higher_better'
self.logger.warning(" [WARN] direction not found, default: higher_better")
# Indicator category
if has_category:
cat_map = self.df_clean[
['indicator_standardized', 'indicator_category']
].drop_duplicates()
cat_map = self.df_clean[['indicator_standardized', 'indicator_category']].drop_duplicates()
cat_map.columns = ['indicator_name', 'indicator_category']
dim_indicator = dim_indicator.merge(cat_map, on='indicator_name', how='left')
else:
def categorize_indicator(name):
n = str(name).lower()
if any(w in n for w in [
'undernourishment', 'malnutrition', 'stunting',
'wasting', 'anemia', 'anaemia', 'food security',
'food insecure', 'hunger'
]):
if any(w in n for w in ['undernourishment', 'malnutrition', 'stunting',
'wasting', 'anemia', 'food security', 'food insecure', 'hunger']):
return 'Health & Nutrition'
elif any(w in n for w in [
'production', 'yield', 'cereal', 'crop',
'import dependency', 'share of dietary'
]):
elif any(w in n for w in ['production', 'yield', 'cereal', 'crop',
'import dependency', 'share of dietary']):
return 'Agricultural Production'
elif any(w in n for w in ['import', 'export', 'trade']):
return 'Trade'
elif any(w in n for w in ['gdp', 'income', 'economic']):
return 'Economic'
elif any(w in n for w in [
'water', 'sanitation', 'infrastructure', 'rail'
]):
elif any(w in n for w in ['water', 'sanitation', 'infrastructure', 'rail']):
return 'Infrastructure'
else:
return 'Supporting'
dim_indicator['indicator_category'] = dim_indicator['indicator_name'].apply(
categorize_indicator
)
return 'Other'
dim_indicator['indicator_category'] = dim_indicator['indicator_name'].apply(categorize_indicator)
dim_indicator = dim_indicator.drop_duplicates(subset=['indicator_name'], keep='first')
dim_indicator_final = dim_indicator[
['indicator_name', 'indicator_category', 'unit', 'direction']
].copy()
@@ -401,22 +374,17 @@ class DimensionalModelLoader:
)
self._add_primary_key(table_name, 'indicator_id')
# Log distribusi
for label, col in [
('Categories', 'indicator_category'),
('Direction', 'direction'),
]:
for label, col in [('Categories', 'indicator_category'), ('Direction', 'direction')]:
self.logger.info(f" {label}:")
for val, cnt in dim_indicator_final[col].value_counts().items():
pct = cnt / len(dim_indicator_final) * 100
self.logger.info(f" - {val}: {cnt} ({pct:.1f}%)")
self.logger.info(f" - {val}: {cnt} ({cnt/len(dim_indicator_final)*100:.1f}%)")
self.load_metadata[table_name].update(
{'rows_loaded': rows_loaded, 'status': 'success', 'end_time': datetime.now()}
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" dim_indicator: {rows_loaded} rows\n")
self.logger.info(f" dim_indicator: {rows_loaded} rows\n")
return rows_loaded
except Exception as e:
@@ -427,7 +395,7 @@ class DimensionalModelLoader:
def load_dim_source(self):
table_name = 'dim_source'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_source -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading dim_source [DW/Gold] fs_asean_gold...")
try:
source_details = {
@@ -487,7 +455,7 @@ class DimensionalModelLoader:
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" dim_source: {rows_loaded} rows\n")
self.logger.info(f" dim_source: {rows_loaded} rows\n")
return rows_loaded
except Exception as e:
@@ -498,15 +466,15 @@ class DimensionalModelLoader:
def load_dim_pillar(self):
table_name = 'dim_pillar'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_pillar -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading dim_pillar [DW/Gold] fs_asean_gold...")
try:
pillar_codes = {
'Availability': 'AVL', 'Access' : 'ACC',
'Utilization' : 'UTL', 'Stability': 'STB', 'Supporting': 'SPT',
'Utilization' : 'UTL', 'Stability': 'STB', 'Other': 'OTH',
}
pillars_data = [
{'pillar_name': p, 'pillar_code': pillar_codes.get(p, 'SPT')}
{'pillar_name': p, 'pillar_code': pillar_codes.get(p, 'OTH')}
for p in self.df_clean['pillar'].unique()
]
@@ -533,7 +501,7 @@ class DimensionalModelLoader:
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" dim_pillar: {rows_loaded} rows\n")
self.logger.info(f" dim_pillar: {rows_loaded} rows\n")
return rows_loaded
except Exception as e:
@@ -548,9 +516,10 @@ class DimensionalModelLoader:
def load_fact_food_security(self):
table_name = 'fact_food_security'
self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading fact_food_security -> [DW/Gold] fs_asean_gold...")
self.logger.info("Loading fact_food_security [DW/Gold] fs_asean_gold...")
try:
# Load dims dari Gold untuk FK resolution
dim_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
dim_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
dim_time = read_from_bigquery(self.client, 'dim_time', layer='gold')
@@ -592,9 +561,9 @@ class DimensionalModelLoader:
fact_table['start_year'] = fact_table['year'].astype(int)
fact_table['end_year'] = fact_table['year'].astype(int)
# Resolve FKs
fact_table = fact_table.merge(
dim_country[['country_id', 'country_name']].rename(
columns={'country_name': 'country'}),
dim_country[['country_id', 'country_name']].rename(columns={'country_name': 'country'}),
on='country', how='left'
)
fact_table = fact_table.merge(
@@ -607,16 +576,15 @@ class DimensionalModelLoader:
on=['start_year', 'end_year'], how='left'
)
fact_table = fact_table.merge(
dim_source[['source_id', 'source_name']].rename(
columns={'source_name': 'source'}),
dim_source[['source_id', 'source_name']].rename(columns={'source_name': 'source'}),
on='source', how='left'
)
fact_table = fact_table.merge(
dim_pillar[['pillar_id', 'pillar_name']].rename(
columns={'pillar_name': 'pillar'}),
dim_pillar[['pillar_id', 'pillar_name']].rename(columns={'pillar_name': 'pillar'}),
on='pillar', how='left'
)
# Filter hanya row dengan FK lengkap
fact_table = fact_table[
fact_table['country_id'].notna() &
fact_table['indicator_id'].notna() &
@@ -653,6 +621,7 @@ class DimensionalModelLoader:
layer='gold', write_disposition="WRITE_TRUNCATE", schema=schema
)
# Add PK + FKs
self._add_primary_key(table_name, 'fact_id')
self._add_foreign_key(table_name, 'country_id', 'dim_country', 'country_id')
self._add_foreign_key(table_name, 'indicator_id', 'dim_indicator', 'indicator_id')
@@ -665,7 +634,7 @@ class DimensionalModelLoader:
)
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name)
self.logger.info(f" fact_food_security: {rows_loaded:,} rows\n")
self.logger.info(f" fact_food_security: {rows_loaded:,} rows\n")
return rows_loaded
except Exception as e:
@@ -748,15 +717,11 @@ class DimensionalModelLoader:
FROM `{get_table_id('dim_indicator', layer='gold')}`
GROUP BY direction ORDER BY direction
"""
df_dir = self.client.query(query_dir).result().to_dataframe(
create_bqstorage_client=False
)
df_dir = self.client.query(query_dir).result().to_dataframe(create_bqstorage_client=False)
if len(df_dir) > 0:
self.logger.info(f"\n Direction Distribution:")
for _, row in df_dir.iterrows():
self.logger.info(
f" {row['direction']:15s}: {int(row['count']):>5,} indicators"
)
self.logger.info(f" {row['direction']:15s}: {int(row['count']):>5,} indicators")
self.logger.info("\n [OK] Validation completed")
except Exception as e:
@@ -773,19 +738,22 @@ class DimensionalModelLoader:
self.pipeline_metadata['rows_fetched'] = len(self.df_clean)
self.logger.info("\n" + "=" * 60)
self.logger.info("DIMENSIONAL MODEL LOAD — DW (Gold) -> fs_asean_gold")
self.logger.info("DIMENSIONAL MODEL LOAD — DW (Gold) fs_asean_gold")
self.logger.info("=" * 60)
self.logger.info("\nLOADING DIMENSION TABLES -> fs_asean_gold")
# Dimensions
self.logger.info("\nLOADING DIMENSION TABLES → fs_asean_gold")
self.load_dim_country()
self.load_dim_indicator()
self.load_dim_time()
self.load_dim_source()
self.load_dim_pillar()
self.logger.info("\nLOADING FACT TABLE -> fs_asean_gold")
# Fact
self.logger.info("\nLOADING FACT TABLE → fs_asean_gold")
self.load_fact_food_security()
# Validate
self.validate_constraints()
self.validate_data_load()
@@ -801,9 +769,7 @@ class DimensionalModelLoader:
'execution_timestamp': self.pipeline_metadata['start_time'],
'completeness_pct' : 100.0,
'config_snapshot' : json.dumps({'load_mode': 'full_refresh', 'layer': 'gold'}),
'validation_metrics' : json.dumps(
{t: m['status'] for t, m in self.load_metadata.items()}
),
'validation_metrics': json.dumps({t: m['status'] for t, m in self.load_metadata.items()}),
'table_name' : 'dimensional_model_pipeline',
})
try:
@@ -811,6 +777,7 @@ class DimensionalModelLoader:
except Exception as e:
self.logger.warning(f" [WARN] Could not save pipeline metadata: {e}")
# Summary
self.logger.info("\n" + "=" * 60)
self.logger.info("DIMENSIONAL MODEL LOAD COMPLETED")
self.logger.info("=" * 60)
@@ -818,19 +785,20 @@ class DimensionalModelLoader:
self.logger.info(f" Duration : {duration:.2f}s")
self.logger.info(f" Tables :")
for tbl, meta in self.load_metadata.items():
icon = "OK" if meta['status'] == 'success' else "FAIL"
self.logger.info(f" [{icon}] {tbl:25s}: {meta['rows_loaded']:>10,} rows")
self.logger.info(f"\n Metadata -> [AUDIT] etl_metadata")
icon = "" if meta['status'] == 'success' else ""
self.logger.info(f" {icon} {tbl:25s}: {meta['rows_loaded']:>10,} rows")
self.logger.info(f"\n Metadata [AUDIT] etl_metadata")
self.logger.info("=" * 60)
# =============================================================================
# AIRFLOW TASK FUNCTIONS
# AIRFLOW TASK FUNCTIONS ← sama polanya dengan raw & cleaned layer
# =============================================================================
def run_dimensional_model():
"""
Airflow task: Load dimensional model dari cleaned_integrated.
Dipanggil oleh DAG setelah task cleaned_integration_to_silver selesai.
"""
from scripts.bigquery_config import get_bigquery_client
@@ -849,9 +817,9 @@ if __name__ == "__main__":
print("=" * 60)
print("BIGQUERY DIMENSIONAL MODEL LOAD")
print("Kimball DW Architecture")
print(" Input : STAGING (Silver) -> cleaned_integrated (fs_asean_silver)")
print(" Output : DW (Gold) -> dim_*, fact_* (fs_asean_gold)")
print(" Audit : AUDIT -> etl_logs, etl_metadata (fs_asean_audit)")
print(" Input : STAGING (Silver) cleaned_integrated (fs_asean_silver)")
print(" Output : DW (Gold) dim_*, fact_* (fs_asean_gold)")
print(" Audit : AUDIT etl_logs, etl_metadata (fs_asean_audit)")
print("=" * 60)
logger = setup_logging()
@@ -859,22 +827,24 @@ if __name__ == "__main__":
print("\nLoading cleaned_integrated (fs_asean_silver)...")
df_clean = read_from_bigquery(client, 'cleaned_integrated', layer='silver')
print(f" Loaded : {len(df_clean):,} rows")
print(f" Loaded : {len(df_clean):,} rows")
print(f" Columns : {len(df_clean.columns)}")
print(f" Sources : {df_clean['source'].nunique()}")
print(f" Indicators : {df_clean['indicator_standardized'].nunique()}")
print(f" Countries : {df_clean['country'].nunique()}")
print(f" Year range : {int(df_clean['year'].min())}-{int(df_clean['year'].max())}")
print(f" Year range : {int(df_clean['year'].min())}{int(df_clean['year'].max())}")
if 'direction' in df_clean.columns:
print(f" Direction : {df_clean['direction'].value_counts().to_dict()}")
else:
print(f" [WARN] direction column not found — run bigquery_cleaned_layer.py first")
print("\n[1/1] Dimensional Model Load -> DW (Gold)...")
print("\n[1/1] Dimensional Model Load DW (Gold)...")
loader = DimensionalModelLoader(client, df_clean)
loader.run()
print("\n" + "=" * 60)
print("[OK] DIMENSIONAL MODEL ETL COMPLETED")
print(" DW (Gold) : dim_country, dim_indicator, dim_time,")
print(" DIMENSIONAL MODEL ETL COMPLETED")
print(" 🥇 DW (Gold) : dim_country, dim_indicator, dim_time,")
print(" dim_source, dim_pillar, fact_food_security")
print(" AUDIT : etl_logs, etl_metadata")
print(" 📋 AUDIT : etl_logs, etl_metadata")
print("=" * 60)