code last done

This commit is contained in:
Debby
2026-04-02 19:58:05 +07:00
parent 6030268924
commit 47ea9c0492
4 changed files with 708 additions and 1374 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,44 +1,14 @@
""" """
BIGQUERY ANALYTICAL LAYER - DATA FILTERING BIGQUERY ANALYTICAL LAYER - DATA FILTERING
fact_asean_food_security_selected disimpan di fs_asean_gold (layer='gold') FIXED: analytical_food_security disimpan di fs_asean_gold (layer='gold')
Filtering Order: Filtering Order:
1. Load data (single years only) 1. Load data (single years only)
2. Determine year boundaries (2013 - auto-detected end year, baseline=2023 per syarat dosen) 2. Determine year boundaries (2013 - auto-detected end year)
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps) 3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
4. Filter countries with ALL pillars (FIXED SET) 4. Filter countries with ALL pillars (FIXED SET)
5. Filter indicators with consistent presence across FIXED countries 5. Filter indicators with consistent presence across FIXED countries
→ TIDAK menghapus baris year < max_start_year 6. Save analytical table (value only, normalisasi & direction handled downstream)
→ Semua baris tetap ada; label framework ditentukan di Step 6
6. Assign framework (MDGs/SDGs) per indicator PER ROW
→ Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' selalu
→ Indikator DI SDG_ONLY_KEYWORDS + year >= SDG_TRANSITION_YEAR → 'SDGs'
→ Indikator DI SDG_ONLY_KEYWORDS + year < SDG_TRANSITION_YEAR → 'MDGs'
→ SDG_TRANSITION_YEAR = 2015 (HARDCODE — tanggal resmi SDGs berlaku)
7. Verify no gaps (dari actual_start_year per indikator, bukan start_year global)
8. Calculate norm_value_1_100 per indicator (min-max, direction-aware, global)
*** PERBAIKAN: normalisasi dilakukan SEKALI untuk seluruh data (semua tahun),
bukan per-framework, agar nilai dari era MDGs dan SDGs berada di
skala yang sama dan dapat dibandingkan secara adil. ***
9. Calculate YoY per indicator per country
10. Analyze indicator availability by year
11. Save analytical table
FRAMEWORK LOGIC:
- SDG_TRANSITION_YEAR = 2015 (HARDCODE, bukan auto-detect dari data)
- Semua SDG-only indicators menggunakan SDG_TRANSITION_YEAR yang SAMA
- SDG-only + year < SDG_TRANSITION_YEAR → 'MDGs' (data tetap ada, tidak dihapus)
- SDG-only + year >= SDG_TRANSITION_YEAR → 'SDGs'
- Non-SDG-only indicators → 'MDGs' selalu (di semua tahun)
NORMALISASI (PERBAIKAN):
- norm_value_1_100 dihitung SATU KALI per indikator menggunakan seluruh data
(semua tahun, semua negara) sebagai referensi min-max.
- Ini memastikan nilai 60 di era MDGs dan nilai 60 di era SDGs memiliki
makna yang SAMA (posisi relatif yang sama dalam distribusi global).
- Tidak ada rescaling ulang per-framework di layer analitik ini.
- Rescaling per-framework (jika diperlukan untuk visualisasi) sebaiknya
dilakukan di layer agregasi (analysis_layer) dengan flag eksplisit.
""" """
import pandas as pd import pandas as pd
@@ -64,82 +34,21 @@ from scripts.bigquery_helpers import (
from google.cloud import bigquery from google.cloud import bigquery
# =============================================================================
# SDG-ONLY INDICATOR KEYWORDS
# =============================================================================
# Indicator names treated as SDG-only, grouped by the SDG 2 target they are
# filed under. Entries are lower-case with no surrounding whitespace.

# TARGET 2.1.1 — Undernourishment
_SDG_UNDERNOURISHMENT = (
    "prevalence of undernourishment (percent) (3-year average)",
    "number of people undernourished (million) (3-year average)",
)

# TARGET 2.1.2 — Food Insecurity (FIES)
_SDG_FOOD_INSECURITY = (
    "prevalence of severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the female adult population (percent) (3-year average)",
    "number of severely food insecure people (million) (3-year average)",
    "number of severely food insecure male adults (million) (3-year average)",
    "number of severely food insecure female adults (million) (3-year average)",
    "number of moderately or severely food insecure people (million) (3-year average)",
    "number of moderately or severely food insecure male adults (million) (3-year average)",
    "number of moderately or severely food insecure female adults (million) (3-year average)",
)

# TARGET 2.2.1 — Stunting
_SDG_STUNTING = (
    "percentage of children under 5 years of age who are stunted (modelled estimates) (percent)",
    "number of children under 5 years of age who are stunted (modeled estimates) (million)",
)

# TARGET 2.2.2 — Wasting
_SDG_WASTING = (
    "percentage of children under 5 years affected by wasting (percent)",
    "number of children under 5 years affected by wasting (million)",
)

# TARGET 2.2.2 — Overweight (children)
_SDG_OVERWEIGHT = (
    "percentage of children under 5 years of age who are overweight (modelled estimates) (percent)",
    "number of children under 5 years of age who are overweight (modeled estimates) (million)",
)

# TARGET 2.2.3 — Anaemia
_SDG_ANAEMIA = (
    "prevalence of anemia among women of reproductive age (15-49 years) (percent)",
    "number of women of reproductive age (15-49 years) affected by anemia (million)",
)

SDG_ONLY_KEYWORDS = frozenset(
    _SDG_UNDERNOURISHMENT
    + _SDG_FOOD_INSECURITY
    + _SDG_STUNTING
    + _SDG_WASTING
    + _SDG_OVERWEIGHT
    + _SDG_ANAEMIA
)

# =============================================================================
# SDG TRANSITION YEAR — HARDCODE
# =============================================================================
# Fixed literal; it is not derived from the loaded data.
SDG_TRANSITION_YEAR = 2015
# =============================================================================
# THRESHOLD KONDISI (fixed absolute, skala 1-100)
# =============================================================================
# Fixed cut-offs on the 1-100 normalised scale.
THRESHOLD_BAD = 40.0
THRESHOLD_GOOD = 60.0


# The annotations are quoted so the "X | None" spelling does not require
# Python 3.10+ at import time.
def assign_condition(norm_value_1_100: "float | None") -> "str | None":
    """Classify a 1-100 normalised score as 'good', 'moderate' or 'bad'.

    Args:
        norm_value_1_100: Normalised score; may be missing (None / NaN).

    Returns:
        None when the score is missing according to ``pd.isna``;
        'good' when it is strictly greater than THRESHOLD_GOOD;
        'bad' when it is strictly less than THRESHOLD_BAD;
        'moderate' otherwise (a score equal to either threshold is
        'moderate').
    """
    if pd.isna(norm_value_1_100):
        return None
    if norm_value_1_100 > THRESHOLD_GOOD:
        return 'good'
    if norm_value_1_100 < THRESHOLD_BAD:
        return 'bad'
    return 'moderate'
# ============================================================================= # =============================================================================
# ANALYTICAL LAYER CLASS # ANALYTICAL LAYER CLASS
# ============================================================================= # =============================================================================
class AnalyticalLayerLoader: class AnalyticalLayerLoader:
""" """
Analytical Layer Loader for BigQuery Analytical Layer Loader for BigQuery - CORRECTED VERSION v4
PERBAIKAN NORMALISASI: Key Logic:
- norm_value_1_100 dihitung SEKALI per indikator dari seluruh data 1. Complete per country (no gaps from start_year to end_year)
(semua tahun, semua negara). Tidak ada rescaling ulang per-framework. 2. Filter countries with all pillars
- Ini memastikan komparabilitas lintas era MDGs dan SDGs. 3. Ensure indicators have consistent country count across all years
4. Save raw value only (normalisasi & direction handled downstream)
Output: analytical_food_security -> DW layer (Gold) -> fs_asean_gold
""" """
def __init__(self, client: bigquery.Client): def __init__(self, client: bigquery.Client):
@@ -153,14 +62,11 @@ class AnalyticalLayerLoader:
self.df_pillar = None self.df_pillar = None
self.selected_country_ids = None self.selected_country_ids = None
self.indicator_max_start_map = {}
self.start_year = 2013 self.start_year = 2013
self.end_year = None self.end_year = None
self.baseline_year = 2023 self.baseline_year = 2023
self.sdg_transition_year = SDG_TRANSITION_YEAR
self.pipeline_metadata = { self.pipeline_metadata = {
'source_class' : self.__class__.__name__, 'source_class' : self.__class__.__name__,
'start_time' : None, 'start_time' : None,
@@ -175,10 +81,6 @@ class AnalyticalLayerLoader:
self.pipeline_start = None self.pipeline_start = None
self.pipeline_end = None self.pipeline_end = None
# ------------------------------------------------------------------
# STEP 1: LOAD SOURCE DATA
# ------------------------------------------------------------------
def load_source_data(self): def load_source_data(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 1: LOADING SOURCE DATA from fs_asean_gold") self.logger.info("STEP 1: LOADING SOURCE DATA from fs_asean_gold")
@@ -209,17 +111,14 @@ class AnalyticalLayerLoader:
""" """
self.logger.info("Loading fact table with dimensions...") self.logger.info("Loading fact table with dimensions...")
self.df_clean = self.client.query(query).result().to_dataframe( self.df_clean = self.client.query(query).result().to_dataframe(create_bqstorage_client=False)
create_bqstorage_client=False
)
self.logger.info(f" Loaded: {len(self.df_clean):,} rows") self.logger.info(f" Loaded: {len(self.df_clean):,} rows")
if 'is_year_range' in self.df_clean.columns: if 'is_year_range' in self.df_clean.columns:
yr = self.df_clean['is_year_range'].value_counts() yr = self.df_clean['is_year_range'].value_counts()
self.logger.info( self.logger.info(f" Breakdown:")
f" Single years: {yr.get(False, 0):,} | " self.logger.info(f" Single years (is_year_range=False): {yr.get(False, 0):,}")
f"Year ranges: {yr.get(True, 0):,}" self.logger.info(f" Year ranges (is_year_range=True): {yr.get(True, 0):,}")
)
self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold') self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
self.df_country = read_from_bigquery(self.client, 'dim_country', layer='gold') self.df_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
@@ -236,25 +135,20 @@ class AnalyticalLayerLoader:
self.logger.error(f"Error loading source data: {e}") self.logger.error(f"Error loading source data: {e}")
raise raise
# ------------------------------------------------------------------
# STEP 2: DETERMINE YEAR BOUNDARIES
# ------------------------------------------------------------------
def determine_year_boundaries(self): def determine_year_boundaries(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES") self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES")
self.logger.info("=" * 80) self.logger.info("=" * 80)
df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year] df_2023 = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_baseline['indicator_id'].nunique() baseline_indicator_count = df_2023['indicator_id'].nunique()
self.logger.info(f"\n Baseline year (hardcode, syarat dosen): {self.baseline_year}") self.logger.info(f"\nBaseline Year: {self.baseline_year}")
self.logger.info(f" Baseline indicator count: {baseline_indicator_count}") self.logger.info(f"Baseline Indicator Count: {baseline_indicator_count}")
years_sorted = sorted(self.df_clean['year'].unique(), reverse=True) years_sorted = sorted(self.df_clean['year'].unique(), reverse=True)
selected_end_year = None selected_end_year = None
self.logger.info(f"\n Scanning end_year (>= {self.baseline_year}):")
for year in years_sorted: for year in years_sorted:
if year >= self.baseline_year: if year >= self.baseline_year:
df_year = self.df_clean[self.df_clean['year'] == year] df_year = self.df_clean[self.df_clean['year'] == year]
@@ -266,9 +160,9 @@ class AnalyticalLayerLoader:
if selected_end_year is None: if selected_end_year is None:
selected_end_year = self.baseline_year selected_end_year = self.baseline_year
self.logger.warning(f" [!] Fallback to baseline: {selected_end_year}") self.logger.warning(f" [!] No year found, using baseline: {selected_end_year}")
else: else:
self.logger.info(f"\n [OK] Selected end year: {selected_end_year}") self.logger.info(f"\n [OK] Selected End Year: {selected_end_year}")
self.end_year = selected_end_year self.end_year = selected_end_year
original_count = len(self.df_clean) original_count = len(self.df_clean)
@@ -283,10 +177,6 @@ class AnalyticalLayerLoader:
self.logger.info(f" Rows after: {len(self.df_clean):,}") self.logger.info(f" Rows after: {len(self.df_clean):,}")
return self.df_clean return self.df_clean
# ------------------------------------------------------------------
# STEP 3: FILTER COMPLETE INDICATORS PER COUNTRY
# ------------------------------------------------------------------
def filter_complete_indicators_per_country(self): def filter_complete_indicators_per_country(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 3: FILTER COMPLETE INDICATORS PER COUNTRY (NO GAPS)") self.logger.info("STEP 3: FILTER COMPLETE INDICATORS PER COUNTRY (NO GAPS)")
@@ -339,14 +229,9 @@ class AnalyticalLayerLoader:
self.logger.info(f" [-] Removed: {len(removed_combinations):,}") self.logger.info(f" [-] Removed: {len(removed_combinations):,}")
df_valid = pd.DataFrame(valid_combinations) df_valid = pd.DataFrame(valid_combinations)
df_valid['key'] = ( df_valid['key'] = df_valid['country_id'].astype(str) + '_' + df_valid['indicator_id'].astype(str)
df_valid['country_id'].astype(str) + '_' + self.df_clean['key'] = (self.df_clean['country_id'].astype(str) + '_' +
df_valid['indicator_id'].astype(str) self.df_clean['indicator_id'].astype(str))
)
self.df_clean['key'] = (
self.df_clean['country_id'].astype(str) + '_' +
self.df_clean['indicator_id'].astype(str)
)
original_count = len(self.df_clean) original_count = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['key'].isin(df_valid['key'])].copy() self.df_clean = self.df_clean[self.df_clean['key'].isin(df_valid['key'])].copy()
@@ -358,10 +243,6 @@ class AnalyticalLayerLoader:
self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}") self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}")
return self.df_clean return self.df_clean
# ------------------------------------------------------------------
# STEP 4: SELECT COUNTRIES WITH ALL PILLARS
# ------------------------------------------------------------------
def select_countries_with_all_pillars(self): def select_countries_with_all_pillars(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 4: SELECT COUNTRIES WITH ALL PILLARS (FIXED SET)") self.logger.info("STEP 4: SELECT COUNTRIES WITH ALL PILLARS (FIXED SET)")
@@ -384,26 +265,18 @@ class AnalyticalLayerLoader:
f"{row['pillar_count']}/{total_pillars} pillars" f"{row['pillar_count']}/{total_pillars} pillars"
) )
selected_countries = country_pillar_count[ selected_countries = country_pillar_count[country_pillar_count['pillar_count'] == total_pillars]
country_pillar_count['pillar_count'] == total_pillars
]
self.selected_country_ids = selected_countries['country_id'].tolist() self.selected_country_ids = selected_countries['country_id'].tolist()
self.logger.info(f"\n FIXED SET: {len(self.selected_country_ids)} countries") self.logger.info(f"\n FIXED SET: {len(self.selected_country_ids)} countries")
original_count = len(self.df_clean) original_count = len(self.df_clean)
self.df_clean = self.df_clean[ self.df_clean = self.df_clean[self.df_clean['country_id'].isin(self.selected_country_ids)].copy()
self.df_clean['country_id'].isin(self.selected_country_ids)
].copy()
self.logger.info(f" Rows before: {original_count:,}") self.logger.info(f" Rows before: {original_count:,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}") self.logger.info(f" Rows after: {len(self.df_clean):,}")
return self.df_clean return self.df_clean
# ------------------------------------------------------------------
# STEP 5: FILTER INDICATORS CONSISTENT ACROSS FIXED COUNTRIES
# ------------------------------------------------------------------
def filter_indicators_consistent_across_fixed_countries(self): def filter_indicators_consistent_across_fixed_countries(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 5: FILTER INDICATORS WITH CONSISTENT PRESENCE") self.logger.info("STEP 5: FILTER INDICATORS WITH CONSISTENT PRESENCE")
@@ -412,9 +285,7 @@ class AnalyticalLayerLoader:
indicator_country_start = self.df_clean.groupby([ indicator_country_start = self.df_clean.groupby([
'indicator_id', 'indicator_name', 'country_id' 'indicator_id', 'indicator_name', 'country_id'
])['year'].min().reset_index() ])['year'].min().reset_index()
indicator_country_start.columns = [ indicator_country_start.columns = ['indicator_id', 'indicator_name', 'country_id', 'start_year']
'indicator_id', 'indicator_name', 'country_id', 'start_year'
]
indicator_max_start = indicator_country_start.groupby([ indicator_max_start = indicator_country_start.groupby([
'indicator_id', 'indicator_name' 'indicator_id', 'indicator_name'
@@ -463,379 +334,47 @@ class AnalyticalLayerLoader:
raise ValueError("No valid indicators found after filtering!") raise ValueError("No valid indicators found after filtering!")
original_count = len(self.df_clean) original_count = len(self.df_clean)
self.df_clean = self.df_clean[ self.df_clean = self.df_clean[self.df_clean['indicator_id'].isin(valid_indicators)].copy()
self.df_clean['indicator_id'].isin(valid_indicators)
].copy()
self.indicator_max_start_map = ( self.df_clean = self.df_clean.merge(
indicator_max_start[indicator_max_start['indicator_id'].isin(valid_indicators)] indicator_max_start[['indicator_id', 'max_start_year']], on='indicator_id', how='left'
.set_index('indicator_id')['max_start_year']
.to_dict()
) )
self.df_clean = self.df_clean[self.df_clean['year'] >= self.df_clean['max_start_year']].copy()
self.df_clean = self.df_clean.drop('max_start_year', axis=1)
self.logger.info(f"\n Rows before: {original_count:,}") self.logger.info(f"\n Rows before: {original_count:,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}") self.logger.info(f" Rows after: {len(self.df_clean):,}")
self.logger.info(f" Countries: {self.df_clean['country_id'].nunique()}") self.logger.info(f" Countries: {self.df_clean['country_id'].nunique()}")
self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}") self.logger.info(f" Indicators: {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Pillars: {self.df_clean['pillar_id'].nunique()}") self.logger.info(f" Pillars: {self.df_clean['pillar_id'].nunique()}")
self.logger.info(
f"\n [NOTE] Baris year < max_start_year TETAP ADA di data. "
f"Label framework akan ditentukan di Step 6."
)
return self.df_clean return self.df_clean
# ------------------------------------------------------------------
# STEP 6: ASSIGN FRAMEWORK PER ROW
# ------------------------------------------------------------------
def assign_framework(self):
    """Label every row of ``self.df_clean`` with a 'framework' value.

    A row is labelled 'SDGs' only when its indicator name (lower-cased and
    stripped) is a member of SDG_ONLY_KEYWORDS and its year is greater than
    or equal to ``self.sdg_transition_year``. Every other row is labelled
    'MDGs'. No rows are removed.

    Returns:
        ``self.df_clean`` with the added 'framework' column.

    Raises:
        ValueError: If no indicator present in the data matches
            SDG_ONLY_KEYWORDS.
    """
    log = self.logger.info
    transition_year = self.sdg_transition_year

    log("\n" + "=" * 80)
    log("STEP 6: ASSIGN FRAMEWORK PER ROW")
    log("=" * 80)

    log(f"\n SDG_TRANSITION_YEAR : {transition_year} (HARDCODE)")
    log(" Alasan : SDGs resmi berlaku 1 Januari 2015")
    log(" Bukan auto-detect : data FIES/anaemia ada sejak 2013,")
    log(" tapi tahun 2013-2014 harus tetap MDGs")

    # One row per distinct (indicator_id, indicator_name) pair; the name is
    # normalised before the membership test against the keyword set.
    indicator_info = (
        self.df_clean[['indicator_id', 'indicator_name']]
        .drop_duplicates()
        .copy()
    )
    indicator_info['is_sdg_only'] = (
        indicator_info['indicator_name']
        .str.lower()
        .str.strip()
        .isin(SDG_ONLY_KEYWORDS)
    )
    sdg_only_info = indicator_info[indicator_info['is_sdg_only']]
    sdg_only_ids = set(sdg_only_info['indicator_id'])
    non_sdg_ids = set(
        indicator_info.loc[~indicator_info['is_sdg_only'], 'indicator_id']
    )

    log(f"\n SDG-only indicators ({len(sdg_only_ids)}):")
    for _, row in sdg_only_info.iterrows():
        actual_start = self.indicator_max_start_map.get(row['indicator_id'], '?')
        log(
            f" [SDG-only] id={int(row['indicator_id'])} "
            f"actual_start={actual_start} | {row['indicator_name']}"
        )
    log(f"\n Non-SDG-only indicators ({len(non_sdg_ids)}): → MDGs selalu")

    if not sdg_only_ids:
        raise ValueError(
            "Tidak ada indikator SDG-only (FIES/anaemia) yang lolos filter. "
            "Pastikan nama indikator di SDG_ONLY_KEYWORDS cocok dengan data BigQuery."
        )

    # Per-row decision: SDG-only indicator AND year at/after the transition.
    is_sdg_row = (
        self.df_clean['indicator_id'].isin(sdg_only_ids)
        & (self.df_clean['year'] >= transition_year)
    )
    self.df_clean['framework'] = np.where(is_sdg_row, 'SDGs', 'MDGs')

    # The rule literals were empty strings ('' * n), so no rule was drawn;
    # '-' matches the rules used by the other tables in this file.
    rule_short = '-' * 72
    rule_long = '-' * 115

    log("\n Logika assign framework (PER BARIS):")
    log(f" {rule_short}")
    log(" Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' di semua tahun")
    log(" Indikator DI SDG_ONLY_KEYWORDS:")
    # An arrow now separates the year from the label, as in the line above.
    log(f" year < {transition_year} → 'MDGs' (data tetap ada, tidak dihapus)")
    log(f" year >= {transition_year} → 'SDGs'")
    log(f" {rule_short}")

    log("\n Verifikasi framework per indikator:")
    log(f" {rule_long}")
    log(
        f" {'ID':<5} {'Indicator Name':<52} {'Data From':<11} "
        f"{'MDGs rows':<11} {'SDGs rows':<11} {'Note'}"
    )
    log(f" {rule_long}")

    def _year_span(years):
        """Return 'min-max' for a non-empty list of years, '-' otherwise."""
        return f"{min(years)}-{max(years)}" if years else "-"

    for ind_id, grp in self.df_clean.groupby('indicator_id'):
        ind_name = grp['indicator_name'].iloc[0]
        is_mdgs = grp['framework'] == 'MDGs'
        is_sdgs = grp['framework'] == 'SDGs'
        mdgs_rows = is_mdgs.sum()
        sdgs_rows = is_sdgs.sum()
        data_from = int(grp['year'].min())

        if ind_id in sdg_only_ids:
            mdgs_yrs = sorted(grp.loc[is_mdgs, 'year'].unique())
            sdgs_yrs = sorted(grp.loc[is_sdgs, 'year'].unique())
            note = f"MDGs:{_year_span(mdgs_yrs)} | SDGs:{_year_span(sdgs_yrs)}"
        else:
            note = "MDGs always"

        log(
            f" {int(ind_id):<5} {ind_name[:50]:<52} {data_from:<11} "
            f"{mdgs_rows:<11} {sdgs_rows:<11} {note}"
        )

    fw_summary = self.df_clean['framework'].value_counts()
    log("\n Ringkasan rows: " + " | ".join(
        f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items()
    ))

    # Distinct indicators per framework, counted on the final year only.
    end_year_df = self.df_clean[self.df_clean['year'] == self.end_year]
    fw_ind_summary = end_year_df.groupby('framework')['indicator_id'].nunique()
    log(f" Indicators di year={self.end_year}: " + " | ".join(
        f"{fw}: {cnt}" for fw, cnt in fw_ind_summary.items()
    ))

    log(
        f"\n [OK] 'framework' ditambahkan — "
        f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | "
        f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows"
    )
    return self.df_clean
# ------------------------------------------------------------------
# STEP 7: VERIFY NO GAPS
# ------------------------------------------------------------------
def verify_no_gaps(self): def verify_no_gaps(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 7: VERIFY NO GAPS") self.logger.info("STEP 6: VERIFY NO GAPS")
self.logger.info("=" * 80) self.logger.info("=" * 80)
expected_countries = len(self.selected_country_ids) expected_countries = len(self.selected_country_ids)
all_good = True verification = self.df_clean.groupby(['indicator_id', 'year'])['country_id'].nunique().reset_index()
bad_rows = [] verification.columns = ['indicator_id', 'year', 'country_count']
all_good = (verification['country_count'] == expected_countries).all()
for ind_id, grp in self.df_clean.groupby('indicator_id'):
actual_start = self.indicator_max_start_map.get(ind_id)
if actual_start is None:
continue
expected_years = list(range(int(actual_start), self.end_year + 1))
for year in expected_years:
country_count = grp[grp['year'] == year]['country_id'].nunique()
if country_count != expected_countries:
all_good = False
bad_rows.append({
'indicator_id' : int(ind_id),
'year' : int(year),
'country_count': int(country_count),
})
if all_good: if all_good:
self.logger.info( self.logger.info(f" VERIFICATION PASSED — all combinations have {expected_countries} countries")
f" VERIFICATION PASSED — all combinations from actual_start_year "
f"have {expected_countries} countries"
)
else: else:
for row in bad_rows[:10]: bad = verification[verification['country_count'] != expected_countries]
for _, row in bad.head(10).iterrows():
self.logger.error( self.logger.error(
f" Indicator {row['indicator_id']}, Year {row['year']}: " f" Indicator {int(row['indicator_id'])}, Year {int(row['year'])}: "
f"{row['country_count']} countries (expected {expected_countries})" f"{int(row['country_count'])} countries (expected {expected_countries})"
) )
raise ValueError("Gap verification failed!") raise ValueError("Gap verification failed!")
return True return True
# ------------------------------------------------------------------
# STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR
# ------------------------------------------------------------------
# PERBAIKAN:
# Normalisasi dilakukan SEKALI per indikator dari SELURUH DATA
# (semua tahun 2013–end_year, semua negara, tanpa memisahkan framework).
#
# Alasan:
# - Sebelumnya, rescaling per-framework di analysis_layer menyebabkan
# nilai 1-100 era MDGs dan SDGs memiliki referensi yang berbeda,
# sehingga tidak dapat dibandingkan secara adil.
# - Dengan satu normalisasi global per indikator, nilai 60 di era MDGs
# dan nilai 60 di era SDGs berarti hal yang sama: posisi relatif yang
# sama dalam distribusi historis indikator tersebut.
# - Jika SDGs memang era yang lebih buruk secara substantif, itu akan
# tercermin sebagai nilai norm yang memang lebih rendah — bukan artefak
# dari rescaling ulang.
# ------------------------------------------------------------------
def calculate_norm_value(self):
    """Add 'norm_value_1_100' to ``self.df_clean``, scaled once per indicator.

    For each (indicator_id, indicator_name, direction) group the non-null
    'value' entries are min-max scaled onto 1..100 using all rows of that
    group (all years and countries together, regardless of framework). The
    scale is flipped when the direction string is one of DIRECTION_INVERT.
    Groups with fewer than two non-null values get NaN; groups whose values
    are all equal get 50.5.

    Rows whose direction or indicator name is null are kept (grouped with
    ``dropna=False``) and scaled without inversion. The default ``groupby``
    behaviour would drop them from ``self.df_clean`` without any message.

    After scaling, diagnostics are logged: mean score per framework and
    year, the overall MDGs-vs-SDGs gap, and the distribution of
    ``assign_condition`` labels. The method expects a 'framework' column to
    be present for those diagnostics.

    Returns:
        ``self.df_clean`` rebuilt from the per-indicator groups, so rows are
        ordered by group and the index is reset.
    """
    log = self.logger.info

    log("\n" + "=" * 80)
    log("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR (GLOBAL, SEKALI)")
    log("=" * 80)
    log(
        "\n [PERBAIKAN] Normalisasi dilakukan SEKALI per indikator dari seluruh data."
        "\n Tidak ada rescaling ulang per-framework."
        "\n Ini memastikan komparabilitas lintas era MDGs dan SDGs."
    )

    DIRECTION_INVERT = frozenset({
        "negative", "lower_better", "lower_is_better", "inverse", "neg",
    })

    norm_parts = []
    # dropna=False: a null group key must not make its rows vanish.
    indicators = self.df_clean.groupby(
        ['indicator_id', 'indicator_name', 'direction'], dropna=False
    )

    log(
        f"\n {'ID':<5} {'Direction':<15} {'Invert':<8} "
        f"{'Min':>10} {'Max':>10} {'Indicator Name'}"
    )
    log(f" {'-'*90}")

    for (ind_id, ind_name, direction), grp in indicators:
        grp = grp.copy()
        # Text forms are safe to format even when a group key is null.
        id_text = str(int(ind_id)) if pd.notna(ind_id) else 'nan'
        dir_text = str(direction)
        name_text = str(ind_name)[:45]
        do_invert = dir_text.lower().strip() in DIRECTION_INVERT

        if pd.isna(direction):
            self.logger.warning(
                f" [!] Indicator {id_text}: direction is missing; "
                f"values are scaled without inversion"
            )

        valid_mask = grp['value'].notna()
        n_valid = valid_mask.sum()

        if n_valid < 2:
            # A min-max range needs at least two observations.
            grp['norm_value_1_100'] = np.nan
            norm_parts.append(grp)
            self.logger.warning(
                f" {id_text:<5} {dir_text:<15} {'N/A':<8} "
                f"{'N/A':>10} {'N/A':>10} {name_text} [SKIPPED: n_valid={n_valid}]"
            )
            continue

        raw = grp.loc[valid_mask, 'value'].values
        v_min = raw.min()
        v_max = raw.max()

        normed = np.full(len(grp), np.nan)
        if v_min == v_max:
            # All values identical: assign the midpoint of the 1-100 scale.
            normed[valid_mask.values] = 50.5
        else:
            scaled = (raw - v_min) / (v_max - v_min)
            if do_invert:
                scaled = 1.0 - scaled
            normed[valid_mask.values] = 1.0 + scaled * 99.0

        grp['norm_value_1_100'] = normed
        log(
            f" {id_text:<5} {dir_text:<15} {'YES' if do_invert else 'no':<8} "
            f"{v_min:>10.3f} {v_max:>10.3f} {name_text}"
        )
        norm_parts.append(grp)

    self.df_clean = pd.concat(norm_parts, ignore_index=True)

    norm_col = self.df_clean['norm_value_1_100']
    valid_norm = norm_col.notna().sum()
    null_norm = norm_col.isna().sum()
    log(f"\n norm_value_1_100 — valid: {valid_norm:,} | null: {null_norm:,}")
    log(
        f" Range aktual: "
        f"{norm_col.min():.2f} - "
        f"{norm_col.max():.2f}"
    )

    # ----------------------------------------------------------------
    # COMPARABILITY DIAGNOSTICS: mean score per framework and year.
    # Informational only; nothing here raises on a large gap.
    # ----------------------------------------------------------------
    log("\n [DIAGNOSTIK KOMPARABILITAS] Rata-rata norm per framework per tahun:")
    # The rule literal was an empty string; '-' matches the table rule above.
    log(f" {'-'*55}")

    fw_year_mean = (
        self.df_clean
        .groupby(['framework', 'year'])['norm_value_1_100']
        .mean()
        .reset_index()
        .sort_values(['framework', 'year'])
    )

    for fw, grp_fw in fw_year_mean.groupby('framework'):
        log(f"\n Framework: {fw}")
        for yr, m in zip(grp_fw['year'].values, grp_fw['norm_value_1_100'].values):
            # One mark per 5 points. A NaN mean draws no bar instead of
            # raising inside int().
            bar = '#' * int(m / 5) if pd.notna(m) else ''
            log(f" {int(yr)} : {m:6.2f} {bar}")

    # Overall means over ALL rows of each framework (not restricted to
    # years that both frameworks share).
    framework_col = self.df_clean['framework']
    mdgs_mean_total = norm_col[framework_col == 'MDGs'].mean()
    sdgs_mean_total = norm_col[framework_col == 'SDGs'].mean()
    gap = mdgs_mean_total - sdgs_mean_total

    log(
        f"\n Rata-rata keseluruhan:"
        f"\n MDGs : {mdgs_mean_total:.2f}"
        f"\n SDGs : {sdgs_mean_total:.2f}"
        f"\n Gap : {gap:.2f} poin"
    )

    if abs(gap) > 15:
        log(
            f"\n [INFO] Gap {gap:.2f} poin antara MDGs dan SDGs."
            f"\n Setelah perbaikan normalisasi (satu referensi global),"
            f"\n gap ini mencerminkan perbedaan SUBSTANTIF, bukan artefak teknis."
            f"\n Indikator SDGs memang mengukur dimensi deprivasi yang lebih dalam"
            f"\n (FIES, stunting, wasting, anaemia) dibanding indikator MDGs."
        )
    else:
        log(
            f"\n [OK] Gap {gap:.2f} poin — dalam batas wajar, tidak ada bias sistematis."
        )

    # Condition distribution, computed on the fly (no column is stored).
    cond_dist = norm_col.apply(assign_condition).value_counts()
    log(
        f"\n Distribusi kondisi "
        f"(threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}):"
    )
    for cond, cnt in cond_dist.items():
        log(f" {cond}: {cnt:,} rows")

    log("\n [OK] Kolom 'norm_value_1_100' ditambahkan ke df_clean")
    return self.df_clean
# ------------------------------------------------------------------
# STEP 9: CALCULATE YOY
# ------------------------------------------------------------------
def calculate_yoy(self):
    """Add year-over-year columns to ``self.df_clean``.

    Rows are sorted by country, indicator and year. Within each
    (country_id, indicator_id) series, 'yoy_change' is the value minus the
    previous row's value, and 'yoy_pct' is that change as a percentage of
    the absolute previous value. 'yoy_pct' is NaN when there is no previous
    value or the previous value is zero.

    Returns:
        The sorted frame with 'yoy_change' and 'yoy_pct', which is also
        stored back on ``self.df_clean``.
    """
    banner = "=" * 80
    self.logger.info("\n" + banner)
    self.logger.info("STEP 9: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY")
    self.logger.info(banner)

    series_keys = ['country_id', 'indicator_id']
    frame = self.df_clean.sort_values([*series_keys, 'year']).copy()

    # Scratch column holding the previous row's value within each series.
    frame['value_prev'] = frame.groupby(series_keys)['value'].shift(1)
    frame['yoy_change'] = frame['value'] - frame['value_prev']

    has_usable_base = frame['value_prev'].notna() & (frame['value_prev'] != 0)
    pct_of_base = (frame['yoy_change'] / frame['value_prev'].abs()) * 100
    frame['yoy_pct'] = np.where(has_usable_base, pct_of_base, np.nan)

    frame = frame.drop(columns=['value_prev'])

    pct_missing = frame['yoy_pct'].isna()
    self.logger.info(f" Total rows : {len(frame):,}")
    self.logger.info(f" YoY calculated : {(~pct_missing).sum():,}")
    self.logger.info(f" YoY NULL (base yr): {pct_missing.sum():,}")

    self.df_clean = frame
    self.logger.info(f" [OK] Kolom 'yoy_change', 'yoy_pct' ditambahkan")
    return self.df_clean
# ------------------------------------------------------------------
# STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR
# ------------------------------------------------------------------
def analyze_indicator_availability_by_year(self): def analyze_indicator_availability_by_year(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR") self.logger.info("STEP 7: ANALYZE INDICATOR AVAILABILITY BY YEAR")
self.logger.info("=" * 80) self.logger.info("=" * 80)
year_stats = self.df_clean.groupby('year').agg({ year_stats = self.df_clean.groupby('year').agg({
@@ -861,139 +400,57 @@ class AnalyticalLayerLoader:
'indicator_id', 'indicator_name', 'pillar_name', 'direction', 'indicator_id', 'indicator_name', 'pillar_name', 'direction',
'start_year', 'end_year', 'country_count' 'start_year', 'end_year', 'country_count'
] ]
fw_at_end = (
self.df_clean[self.df_clean['year'] == self.end_year]
.groupby('indicator_id')['framework']
.first()
.reset_index()
)
indicator_details = indicator_details.merge(fw_at_end, on='indicator_id', how='left')
indicator_details['framework'] = indicator_details['framework'].fillna('MDGs')
indicator_details['year_range'] = ( indicator_details['year_range'] = (
indicator_details['start_year'].astype(int).astype(str) + '-' + indicator_details['start_year'].astype(int).astype(str) + '-' +
indicator_details['end_year'].astype(int).astype(str) indicator_details['end_year'].astype(int).astype(str)
) )
indicator_details = indicator_details.sort_values( indicator_details = indicator_details.sort_values(['pillar_name', 'start_year', 'indicator_name'])
['framework', 'pillar_name', 'start_year', 'indicator_name']
)
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}") self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
self.logger.info(f"Framework breakdown (at end_year={self.end_year}):") for pillar, count in indicator_details.groupby('pillar_name').size().items():
for fw, count in indicator_details.groupby('framework').size().items(): self.logger.info(f" {pillar}: {count} indicators")
self.logger.info(f" {fw}: {count} indicators")
self.logger.info(f"\n{'-'*110}") self.logger.info(f"\n{'-'*100}")
self.logger.info( self.logger.info(f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} {'Years':<12} {'Dir':<8} {'Countries'}")
f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} " self.logger.info(f"{'-'*100}")
f"{'Framework':<10} {'Years':<12} {'Dir':<8} {'Countries'}"
)
self.logger.info(f"{'-'*110}")
for _, row in indicator_details.iterrows(): for _, row in indicator_details.iterrows():
direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-' direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-'
self.logger.info( self.logger.info(
f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} " f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} "
f"{row['pillar_name'][:13]:<15} {row['framework']:<10} " f"{row['pillar_name'][:13]:<15} {row['year_range']:<12} "
f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}" f"{direction:<8} {int(row['country_count'])}"
) )
return year_stats return year_stats
# ------------------------------------------------------------------
# STEP 11: SAVE ANALYTICAL TABLE
# ------------------------------------------------------------------
def save_analytical_table(self): def save_analytical_table(self):
table_name = 'fact_asean_food_security_selected' table_name = 'analytical_food_security'
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info(f"STEP 11: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold") self.logger.info(f"STEP 8: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold")
self.logger.info("=" * 80) self.logger.info("=" * 80)
try: try:
if 'framework' not in self.df_clean.columns:
raise ValueError("Kolom 'framework' tidak ada. Pastikan Step 6 sudah dijalankan.")
if 'norm_value_1_100' not in self.df_clean.columns:
raise ValueError("Kolom 'norm_value_1_100' tidak ada. Pastikan Step 8 sudah dijalankan.")
if 'yoy_change' not in self.df_clean.columns:
raise ValueError("Kolom 'yoy_change' tidak ada. Pastikan Step 9 sudah dijalankan.")
analytical_df = self.df_clean[[ analytical_df = self.df_clean[[
'country_id', 'country_id', 'indicator_id', 'pillar_id', 'time_id', 'value'
'country_name',
'indicator_id',
'indicator_name',
'direction',
'framework',
'pillar_id',
'pillar_name',
'time_id',
'year',
'value',
'norm_value_1_100',
'yoy_change',
'yoy_pct',
]].copy() ]].copy()
analytical_df = analytical_df.sort_values( analytical_df = analytical_df.sort_values(
['year', 'country_name', 'pillar_name', 'indicator_name'] ['time_id', 'country_id', 'indicator_id']
).reset_index(drop=True) ).reset_index(drop=True)
analytical_df['country_id'] = analytical_df['country_id'].astype(int) analytical_df['country_id'] = analytical_df['country_id'].astype(int)
analytical_df['country_name'] = analytical_df['country_name'].astype(str)
analytical_df['indicator_id'] = analytical_df['indicator_id'].astype(int) analytical_df['indicator_id'] = analytical_df['indicator_id'].astype(int)
analytical_df['indicator_name'] = analytical_df['indicator_name'].astype(str)
analytical_df['direction'] = analytical_df['direction'].astype(str)
analytical_df['framework'] = analytical_df['framework'].astype(str)
analytical_df['pillar_id'] = analytical_df['pillar_id'].astype(int) analytical_df['pillar_id'] = analytical_df['pillar_id'].astype(int)
analytical_df['pillar_name'] = analytical_df['pillar_name'].astype(str)
analytical_df['time_id'] = analytical_df['time_id'].astype(int) analytical_df['time_id'] = analytical_df['time_id'].astype(int)
analytical_df['year'] = analytical_df['year'].astype(int)
analytical_df['value'] = analytical_df['value'].astype(float) analytical_df['value'] = analytical_df['value'].astype(float)
analytical_df['norm_value_1_100'] = analytical_df['norm_value_1_100'].astype(float)
analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float)
analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float)
self.logger.info(f" Total rows: {len(analytical_df):,}") self.logger.info(f" Total rows: {len(analytical_df):,}")
fw_dist_rows = analytical_df['framework'].value_counts()
self.logger.info(f" Framework distribution (rows):")
for fw, cnt in fw_dist_rows.items():
self.logger.info(f" {fw}: {cnt:,} rows")
fw_dist_ind = (
analytical_df[analytical_df['year'] == self.end_year]
.drop_duplicates('indicator_id')['framework']
.value_counts()
)
self.logger.info(
f" Framework distribution (indicators at year={self.end_year}):"
)
for fw, cnt in fw_dist_ind.items():
self.logger.info(f" {fw}: {cnt} indicators")
self.logger.info(
f" norm_value_1_100 range: "
f"{analytical_df['norm_value_1_100'].min():.2f} - "
f"{analytical_df['norm_value_1_100'].max():.2f}"
)
schema = [ schema = [
bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("country_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("indicator_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("indicator_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("indicator_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
bigquery.SchemaField("pillar_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("pillar_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("pillar_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("norm_value_1_100", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"),
] ]
rows_loaded = load_to_bigquery( rows_loaded = load_to_bigquery(
@@ -1016,63 +473,31 @@ class AnalyticalLayerLoader:
'config_snapshot' : json.dumps({ 'config_snapshot' : json.dumps({
'start_year' : self.start_year, 'start_year' : self.start_year,
'end_year' : self.end_year, 'end_year' : self.end_year,
'baseline_year' : self.baseline_year,
'sdg_transition_year' : self.sdg_transition_year,
'sdg_transition_source' : 'HARDCODE — SDGs resmi berlaku 1 Jan 2015',
'fixed_countries': len(self.selected_country_ids), 'fixed_countries': len(self.selected_country_ids),
'norm_scale' : ( 'no_gaps' : True,
'1-100 per indicator global minmax direction-aware. ' 'layer' : 'gold'
'SATU normalisasi untuk seluruh data tanpa rescaling per-framework. '
'Komparabilitas lintas era MDGs/SDGs terjamin.'
),
'framework_logic' : (
f'SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE); '
'SDG-only + year >= SDG_TRANSITION_YEAR → SDGs; '
'SDG-only + year < SDG_TRANSITION_YEAR → MDGs (data tetap ada); '
'non-SDG-only → MDGs selalu'
),
'sdg_only_keywords_count': len(SDG_ONLY_KEYWORDS),
'condition_thresholds' : {
'bad' : f'< {THRESHOLD_BAD}',
'moderate': f'{THRESHOLD_BAD}-{THRESHOLD_GOOD}',
'good' : f'> {THRESHOLD_GOOD}',
},
}), }),
'validation_metrics' : json.dumps({ 'validation_metrics' : json.dumps({
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'total_indicators' : int(self.df_clean['indicator_id'].nunique()), 'total_indicators': int(self.df_clean['indicator_id'].nunique())
'sdg_transition_year': self.sdg_transition_year,
'framework_dist_rows': fw_dist_rows.to_dict(),
}) })
} }
save_etl_metadata(self.client, metadata) save_etl_metadata(self.client, metadata)
self.logger.info(f" [OK] {table_name}: {rows_loaded:,} rows -> fs_asean_gold") self.logger.info(f" {table_name}: {rows_loaded:,} rows → [DW/Gold] fs_asean_gold")
self.logger.info(f" Metadata → [AUDIT] etl_metadata")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
self.logger.error(f"Error saving: {e}") self.logger.error(f"Error saving: {e}")
raise raise
# ------------------------------------------------------------------
# RUN
# ------------------------------------------------------------------
def run(self): def run(self):
self.pipeline_start = datetime.now() self.pipeline_start = datetime.now()
self.pipeline_metadata['start_time'] = self.pipeline_start self.pipeline_metadata['start_time'] = self.pipeline_start
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold") self.logger.info("Output: analytical_food_security fs_asean_gold")
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info(
f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). "
"SDG-only + year >= 2015 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu."
)
self.logger.info(
"NORMALISASI: SATU referensi global per indikator — tidak ada rescaling per-framework."
)
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.load_source_data() self.load_source_data()
@@ -1080,10 +505,7 @@ class AnalyticalLayerLoader:
self.filter_complete_indicators_per_country() self.filter_complete_indicators_per_country()
self.select_countries_with_all_pillars() self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries() self.filter_indicators_consistent_across_fixed_countries()
self.assign_framework()
self.verify_no_gaps() self.verify_no_gaps()
self.calculate_norm_value()
self.calculate_yoy()
self.analyze_indicator_availability_by_year() self.analyze_indicator_availability_by_year()
self.save_analytical_table() self.save_analytical_table()
@@ -1095,7 +517,6 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.logger.info(f" Duration : {duration:.2f}s") self.logger.info(f" Duration : {duration:.2f}s")
self.logger.info(f" Year Range : {self.start_year}-{self.end_year}") self.logger.info(f" Year Range : {self.start_year}-{self.end_year}")
self.logger.info(f" SDG Transition Year: {self.sdg_transition_year} (HARDCODE)")
self.logger.info(f" Countries : {len(self.selected_country_ids)}") self.logger.info(f" Countries : {len(self.selected_country_ids)}")
self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}") self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Rows Loaded: {self.pipeline_metadata['rows_loaded']:,}") self.logger.info(f" Rows Loaded: {self.pipeline_metadata['rows_loaded']:,}")
@@ -1106,6 +527,10 @@ class AnalyticalLayerLoader:
# ============================================================================= # =============================================================================
def run_analytical_layer(): def run_analytical_layer():
"""
Airflow task: Build analytical_food_security dari fact_food_security + dims.
Dipanggil setelah dimensional_model_to_gold selesai.
"""
from scripts.bigquery_config import get_bigquery_client from scripts.bigquery_config import get_bigquery_client
client = get_bigquery_client() client = get_bigquery_client()
loader = AnalyticalLayerLoader(client) loader = AnalyticalLayerLoader(client)
@@ -1119,14 +544,7 @@ def run_analytical_layer():
if __name__ == "__main__": if __name__ == "__main__":
print("=" * 80) print("=" * 80)
print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING") print("Output: analytical_food_security → fs_asean_gold")
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print(f"Norm: min-max 1-100 per indicator, direction-aware, GLOBAL (satu referensi)")
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
print(
f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). "
"SDG-only + year >= 2015 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu."
)
print("=" * 80) print("=" * 80)
logger = setup_logging() logger = setup_logging()
@@ -1136,6 +554,4 @@ if __name__ == "__main__":
print("\n" + "=" * 80) print("\n" + "=" * 80)
print("[OK] COMPLETED") print("[OK] COMPLETED")
print(f" SDG Transition Year : {loader.sdg_transition_year} (HARDCODE)")
print(f" Rows Loaded : {loader.pipeline_metadata['rows_loaded']:,}")
print("=" * 80) print("=" * 80)

View File

@@ -40,7 +40,7 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
"""Load data dari staging_integrated (STAGING/Silver layer).""" """Load data dari staging_integrated (STAGING/Silver layer)."""
print("\nLoading data from staging_integrated (fs_asean_silver)...") print("\nLoading data from staging_integrated (fs_asean_silver)...")
df_staging = read_from_bigquery(client, 'staging_integrated', layer='silver') df_staging = read_from_bigquery(client, 'staging_integrated', layer='silver')
print(f" Loaded : {len(df_staging):,} rows") print(f" Loaded : {len(df_staging):,} rows")
print(f" Columns : {len(df_staging.columns)}") print(f" Columns : {len(df_staging.columns)}")
print(f" Sources : {df_staging['source'].nunique()}") print(f" Sources : {df_staging['source'].nunique()}")
print(f" Indicators : {df_staging['indicator_standardized'].nunique()}") print(f" Indicators : {df_staging['indicator_standardized'].nunique()}")
@@ -53,6 +53,7 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
# COLUMN CONSTRAINT HELPERS # COLUMN CONSTRAINT HELPERS
# ============================================================================= # =============================================================================
# Schema constraints — semua varchar max lengths
COLUMN_CONSTRAINTS = { COLUMN_CONSTRAINTS = {
'source' : 20, 'source' : 20,
'indicator_original' : 255, 'indicator_original' : 255,
@@ -61,7 +62,7 @@ COLUMN_CONSTRAINTS = {
'year_range' : 20, 'year_range' : 20,
'unit' : 20, 'unit' : 20,
'pillar' : 20, 'pillar' : 20,
'direction' : 15, 'direction' : 15, # 'higher_better'=13, 'lower_better'=12
} }
@@ -100,11 +101,11 @@ def apply_column_constraints(df: pd.DataFrame) -> pd.DataFrame:
) )
if truncation_report: if truncation_report:
print("\n Column Truncations Applied:") print("\n Column Truncations Applied:")
for column, info in truncation_report.items(): for column, info in truncation_report.items():
print(f" - {column}: {info['count']} values truncated to {info['max_length']} chars") print(f" - {column}: {info['count']} values truncated to {info['max_length']} chars")
else: else:
print("\n No truncations needed — all values within constraints") print("\n No truncations needed — all values within constraints")
return df_constrained return df_constrained
@@ -176,16 +177,16 @@ def standardize_country_names_asean(df: pd.DataFrame, country_column: str = 'cou
def assign_pillar(indicator_name: str) -> str: def assign_pillar(indicator_name: str) -> str:
""" """
Assign pillar berdasarkan keyword indikator. Assign pillar berdasarkan keyword indikator.
Return values: 'Availability', 'Access', 'Utilization', 'Stability', 'Supporting' Return values: 'Availability', 'Access', 'Utilization', 'Stability', 'Other'
All <= 20 chars (varchar(20) constraint). All 20 chars (varchar(20) constraint).
""" """
if pd.isna(indicator_name): if pd.isna(indicator_name):
return 'Supporting' return 'Other'
ind = str(indicator_name).lower() ind = str(indicator_name).lower()
for kw in ['requirement', 'coefficient', 'losses', 'fat supply']: for kw in ['requirement', 'coefficient', 'losses', 'fat supply']:
if kw in ind: if kw in ind:
return 'Supporting' return 'Other'
if any(kw in ind for kw in [ if any(kw in ind for kw in [
'adequacy', 'protein supply', 'supply of protein', 'adequacy', 'protein supply', 'supply of protein',
@@ -209,13 +210,12 @@ def assign_pillar(indicator_name: str) -> str:
if any(kw in ind for kw in [ if any(kw in ind for kw in [
'wasting', 'wasted', 'stunted', 'overweight', 'obese', 'obesity', 'wasting', 'wasted', 'stunted', 'overweight', 'obese', 'obesity',
'anemia', 'anaemia', 'birthweight', 'breastfeeding', 'drinking water', 'anemia', 'birthweight', 'breastfeeding', 'drinking water', 'sanitation',
'sanitation', 'children under 5', 'newborns with low', 'children under 5', 'newborns with low', 'women of reproductive'
'women of reproductive'
]): ]):
return 'Utilization' return 'Utilization'
return 'Supporting' return 'Other'
# ============================================================================= # =============================================================================
@@ -226,15 +226,17 @@ def assign_direction(indicator_name: str) -> str:
""" """
Assign direction berdasarkan indikator. Assign direction berdasarkan indikator.
Return values: 'higher_better' (13 chars) atau 'lower_better' (12 chars) Return values: 'higher_better' (13 chars) atau 'lower_better' (12 chars)
Both <= 15 chars (varchar(15) constraint). Both 15 chars (varchar(15) constraint).
""" """
if pd.isna(indicator_name): if pd.isna(indicator_name):
return 'higher_better' return 'higher_better'
ind = str(indicator_name).lower() ind = str(indicator_name).lower()
# Spesifik lower_better
if 'share of dietary energy supply derived from cereals' in ind: if 'share of dietary energy supply derived from cereals' in ind:
return 'lower_better' return 'lower_better'
# Higher_better exceptions — cek sebelum lower_better keywords
for kw in [ for kw in [
'exclusive breastfeeding', 'exclusive breastfeeding',
'dietary energy supply', 'dietary energy supply',
@@ -246,6 +248,7 @@ def assign_direction(indicator_name: str) -> str:
if kw in ind: if kw in ind:
return 'higher_better' return 'higher_better'
# Lower_better — masalah yang harus diminimalkan
for kw in [ for kw in [
'prevalence of undernourishment', 'prevalence of undernourishment',
'prevalence of severe food insecurity', 'prevalence of severe food insecurity',
@@ -256,7 +259,6 @@ def assign_direction(indicator_name: str) -> str:
'prevalence of overweight', 'prevalence of overweight',
'prevalence of obesity', 'prevalence of obesity',
'prevalence of anemia', 'prevalence of anemia',
'prevalence of anaemia',
'prevalence of low birthweight', 'prevalence of low birthweight',
'number of people undernourished', 'number of people undernourished',
'number of severely food insecure', 'number of severely food insecure',
@@ -281,9 +283,6 @@ def assign_direction(indicator_name: str) -> str:
'coefficient of variation', 'coefficient of variation',
'incidence of caloric losses', 'incidence of caloric losses',
'food losses', 'food losses',
'indicator of food price anomalies',
'proportion of local breeds classified as being at risk',
'agricultural export subsidies',
]: ]:
if kw in ind: if kw in ind:
return 'lower_better' return 'lower_better'
@@ -300,18 +299,19 @@ class CleanedDataLoader:
Loader untuk cleaned integrated data ke STAGING layer (Silver). Loader untuk cleaned integrated data ke STAGING layer (Silver).
Kimball context: Kimball context:
Input : staging_integrated -> STAGING (Silver) — fs_asean_silver Input : staging_integrated STAGING (Silver) — fs_asean_silver
Output : cleaned_integrated -> STAGING (Silver) — fs_asean_silver Output : cleaned_integrated STAGING (Silver) — fs_asean_silver
Audit : etl_logs, etl_metadata -> AUDIT — fs_asean_audit Audit : etl_logs, etl_metadata AUDIT — fs_asean_audit
Pipeline steps: Pipeline steps:
1. Standardize country names (ASEAN) 1. Standardize country names (ASEAN)
2. Remove missing values 2. Remove missing values
3. Remove duplicates 3. Remove duplicates
4. Add pillar & direction classification 4. Add pillar classification
5. Apply column constraints 5. Add direction classification
6. Load ke BigQuery 6. Apply column constraints
7. Log ke Audit layer 7. Load ke BigQuery
8. Log ke Audit layer
""" """
SCHEMA = [ SCHEMA = [
@@ -355,7 +355,7 @@ class CleanedDataLoader:
def _step_standardize_countries(self, df: pd.DataFrame) -> pd.DataFrame: def _step_standardize_countries(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 1/5] Standardize country names...") print("\n [Step 1/5] Standardize country names...")
df, report = standardize_country_names_asean(df, country_column='country') df, report = standardize_country_names_asean(df, country_column='country')
print(f" ASEAN countries mapped : {report['countries_mapped']}") print(f" ASEAN countries mapped : {report['countries_mapped']}")
unique_countries = sorted(df['country'].unique()) unique_countries = sorted(df['country'].unique())
print(f" Countries ({len(unique_countries)}) : {', '.join(unique_countries)}") print(f" Countries ({len(unique_countries)}) : {', '.join(unique_countries)}")
log_update(self.client, 'STAGING', 'staging_integrated', log_update(self.client, 'STAGING', 'staging_integrated',
@@ -377,9 +377,7 @@ class CleanedDataLoader:
def _step_remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame: def _step_remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 3/5] Remove duplicates...") print("\n [Step 3/5] Remove duplicates...")
exact_dups = df.duplicated().sum() exact_dups = df.duplicated().sum()
data_dups = df.duplicated( data_dups = df.duplicated(subset=['indicator_standardized', 'country', 'year', 'value']).sum()
subset=['indicator_standardized', 'country', 'year', 'value']
).sum()
print(f" Exact duplicates : {exact_dups:,}") print(f" Exact duplicates : {exact_dups:,}")
print(f" Data duplicates : {data_dups:,}") print(f" Data duplicates : {data_dups:,}")
rows_before = len(df) rows_before = len(df)
@@ -393,21 +391,19 @@ class CleanedDataLoader:
def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
    """Label every row with a pillar and a direction, then print both distributions.

    Works on a copy of ``df``. ``pillar`` and ``direction`` are derived from
    the ``indicator_standardized`` column via ``assign_pillar`` and
    ``assign_direction`` respectively.

    Args:
        df: Frame containing an ``indicator_standardized`` column.

    Returns:
        A copy of ``df`` with ``pillar`` and ``direction`` columns set.
    """
    print("\n [Step 4/5] Add pillar & direction classification...")

    classified = df.copy()
    indicator_names = classified['indicator_standardized']
    classified['pillar'] = indicator_names.apply(assign_pillar)
    classified['direction'] = indicator_names.apply(assign_direction)

    print(f" Pillar distribution:")
    for pillar_label, n_rows in classified['pillar'].value_counts().items():
        print(f" - {pillar_label}: {n_rows:,}")

    print(f" Direction distribution:")
    row_total = len(classified)
    for direction_label, n_rows in classified['direction'].value_counts().items():
        # Share of all rows carrying this direction label.
        share = n_rows / row_total * 100
        print(f" - {direction_label}: {n_rows:,} ({share:.1f}%)")

    return classified
def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame: def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -442,6 +438,7 @@ class CleanedDataLoader:
if 'country' in df.columns: if 'country' in df.columns:
validation['unique_countries'] = int(df['country'].nunique()) validation['unique_countries'] = int(df['country'].nunique())
# Column length check
column_length_check = {} column_length_check = {}
for col, max_len in COLUMN_CONSTRAINTS.items(): for col, max_len in COLUMN_CONSTRAINTS.items():
if col in df.columns: if col in df.columns:
@@ -460,7 +457,7 @@ class CleanedDataLoader:
def run(self, df: pd.DataFrame) -> int: def run(self, df: pd.DataFrame) -> int:
""" """
Execute full cleaning pipeline -> load ke STAGING (Silver). Execute full cleaning pipeline load ke STAGING (Silver).
Returns: Returns:
int: Rows loaded int: Rows loaded
@@ -472,6 +469,7 @@ class CleanedDataLoader:
print(" ERROR: DataFrame is empty, nothing to process.") print(" ERROR: DataFrame is empty, nothing to process.")
return 0 return 0
# Pipeline steps
df = self._step_standardize_countries(df) df = self._step_standardize_countries(df)
df = self._step_remove_missing(df) df = self._step_remove_missing(df)
df = self._step_remove_duplicates(df) df = self._step_remove_duplicates(df)
@@ -480,6 +478,7 @@ class CleanedDataLoader:
self.metadata['rows_transformed'] = len(df) self.metadata['rows_transformed'] = len(df)
# Validate
validation = self.validate_data(df) validation = self.validate_data(df)
self.metadata['validation_metrics'] = validation self.metadata['validation_metrics'] = validation
@@ -488,12 +487,13 @@ class CleanedDataLoader:
for info in validation.get('column_length_check', {}).values() for info in validation.get('column_length_check', {}).values()
) )
if not all_within_limits: if not all_within_limits:
print("\n WARNING: Some columns still exceed length constraints!") print("\n WARNING: Some columns still exceed length constraints!")
for col, info in validation['column_length_check'].items(): for col, info in validation['column_length_check'].items():
if not info['within_limit']: if not info['within_limit']:
print(f" - {col}: {info['max_actual_length']} > {info['max_length_constraint']}") print(f" - {col}: {info['max_actual_length']} > {info['max_length_constraint']}")
print(f"\n Loading to [STAGING/Silver] {self.table_name} -> fs_asean_silver...") # Load ke Silver
print(f"\n Loading to [STAGING/Silver] {self.table_name} → fs_asean_silver...")
rows_loaded = load_to_bigquery( rows_loaded = load_to_bigquery(
self.client, df, self.table_name, self.client, df, self.table_name,
layer='silver', layer='silver',
@@ -502,8 +502,10 @@ class CleanedDataLoader:
) )
self.metadata['rows_loaded'] = rows_loaded self.metadata['rows_loaded'] = rows_loaded
# Audit logs
log_update(self.client, 'STAGING', self.table_name, 'full_refresh', rows_loaded) log_update(self.client, 'STAGING', self.table_name, 'full_refresh', rows_loaded)
# ETL metadata
self.metadata['end_time'] = datetime.now() self.metadata['end_time'] = datetime.now()
self.metadata['duration_seconds'] = ( self.metadata['duration_seconds'] = (
self.metadata['end_time'] - self.metadata['start_time'] self.metadata['end_time'] - self.metadata['start_time']
@@ -514,31 +516,33 @@ class CleanedDataLoader:
self.metadata['validation_metrics'] = json.dumps(validation) self.metadata['validation_metrics'] = json.dumps(validation)
save_etl_metadata(self.client, self.metadata) save_etl_metadata(self.client, self.metadata)
print(f"\n Cleaned Integration completed: {rows_loaded:,} rows") # Summary
print(f"\n ✓ Cleaned Integration completed: {rows_loaded:,} rows")
print(f" Duration : {self.metadata['duration_seconds']:.2f}s") print(f" Duration : {self.metadata['duration_seconds']:.2f}s")
print(f" Completeness : {validation['completeness_pct']:.2f}%") print(f" Completeness : {validation['completeness_pct']:.2f}%")
if 'year_range' in validation: if 'year_range' in validation:
yr = validation['year_range'] yr = validation['year_range']
if yr['min'] and yr['max']: if yr['min'] and yr['max']:
print(f" Year range : {yr['min']}-{yr['max']}") print(f" Year range : {yr['min']}{yr['max']}")
print(f" Indicators : {validation.get('unique_indicators', '-')}") print(f" Indicators : {validation.get('unique_indicators', '-')}")
print(f" Countries : {validation.get('unique_countries', '-')}") print(f" Countries : {validation.get('unique_countries', '-')}")
print(f"\n Schema Validation:") print(f"\n Schema Validation:")
for col, info in validation.get('column_length_check', {}).items(): for col, info in validation.get('column_length_check', {}).items():
status = "OK" if info['within_limit'] else "FAIL" status = "" if info['within_limit'] else ""
print(f" [{status}] {col}: {info['max_actual_length']}/{info['max_length_constraint']}") print(f" {status} {col}: {info['max_actual_length']}/{info['max_length_constraint']}")
print(f"\n Metadata -> [AUDIT] etl_metadata") print(f"\n Metadata [AUDIT] etl_metadata")
return rows_loaded return rows_loaded
# ============================================================================= # =============================================================================
# AIRFLOW TASK FUNCTIONS # AIRFLOW TASK FUNCTIONS ← sama polanya dengan raw layer
# ============================================================================= # =============================================================================
def run_cleaned_integration(): def run_cleaned_integration():
""" """
Airflow task: Load cleaned_integrated dari staging_integrated. Airflow task: Load cleaned_integrated dari staging_integrated.
Dipanggil oleh DAG setelah task staging_integration_to_silver selesai. Dipanggil oleh DAG setelah task staging_integration_to_silver selesai.
""" """
from scripts.bigquery_config import get_bigquery_client from scripts.bigquery_config import get_bigquery_client
@@ -557,21 +561,21 @@ if __name__ == "__main__":
print("=" * 60) print("=" * 60)
print("BIGQUERY CLEANED LAYER ETL") print("BIGQUERY CLEANED LAYER ETL")
print("Kimball DW Architecture") print("Kimball DW Architecture")
print(" Input : STAGING (Silver) -> staging_integrated") print(" Input : STAGING (Silver) staging_integrated")
print(" Output : STAGING (Silver) -> cleaned_integrated") print(" Output : STAGING (Silver) cleaned_integrated")
print(" Audit : AUDIT -> etl_logs, etl_metadata") print(" Audit : AUDIT etl_logs, etl_metadata")
print("=" * 60) print("=" * 60)
logger = setup_logging() logger = setup_logging()
client = get_bigquery_client() client = get_bigquery_client()
df_staging = load_staging_data(client) df_staging = load_staging_data(client)
print("\n[1/1] Cleaned Integration -> STAGING (Silver)...") print("\n[1/1] Cleaned Integration STAGING (Silver)...")
loader = CleanedDataLoader(client, load_mode='full_refresh') loader = CleanedDataLoader(client, load_mode='full_refresh')
final_count = loader.run(df_staging) final_count = loader.run(df_staging)
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("[OK] CLEANED LAYER ETL COMPLETED") print(" CLEANED LAYER ETL COMPLETED")
print(f" STAGING (Silver) : cleaned_integrated ({final_count:,} rows)") print(f" 🥈 STAGING (Silver) : cleaned_integrated ({final_count:,} rows)")
print(f" AUDIT : etl_logs, etl_metadata") print(f" 📋 AUDIT : etl_logs, etl_metadata")
print("=" * 60) print("=" * 60)

View File

@@ -46,9 +46,9 @@ class DimensionalModelLoader:
Loader untuk dimensional model ke DW layer (Gold) — fs_asean_gold. Loader untuk dimensional model ke DW layer (Gold) — fs_asean_gold.
Kimball context: Kimball context:
Input : cleaned_integrated -> STAGING (Silver) — fs_asean_silver Input : cleaned_integrated STAGING (Silver) — fs_asean_silver
Output : dim_* + fact_* -> DW (Gold) — fs_asean_gold Output : dim_* + fact_* DW (Gold) — fs_asean_gold
Audit : etl_logs, etl_metadata -> AUDIT — fs_asean_audit Audit : etl_logs, etl_metadata AUDIT — fs_asean_audit
Pipeline steps: Pipeline steps:
1. Load dim_country 1. Load dim_country
@@ -117,7 +117,7 @@ class DimensionalModelLoader:
""" """
try: try:
self.client.query(query).result() self.client.query(query).result()
self.logger.info(f" [OK] FK: {table_name}.{fk_column} -> {ref_table}.{ref_column}") self.logger.info(f" [OK] FK: {table_name}.{fk_column} {ref_table}.{ref_column}")
except Exception as e: except Exception as e:
if "already exists" in str(e).lower(): if "already exists" in str(e).lower():
self.logger.info(f" [INFO] FK already exists: {constraint_name}") self.logger.info(f" [INFO] FK already exists: {constraint_name}")
@@ -145,7 +145,7 @@ class DimensionalModelLoader:
} }
try: try:
save_etl_metadata(self.client, metadata) save_etl_metadata(self.client, metadata)
self.logger.info(f" Metadata -> [AUDIT] etl_metadata") self.logger.info(f" Metadata [AUDIT] etl_metadata")
except Exception as e: except Exception as e:
self.logger.warning(f" [WARN] Could not save metadata for {table_name}: {e}") self.logger.warning(f" [WARN] Could not save metadata for {table_name}: {e}")
@@ -156,7 +156,7 @@ class DimensionalModelLoader:
def load_dim_time(self): def load_dim_time(self):
table_name = 'dim_time' table_name = 'dim_time'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_time -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading dim_time [DW/Gold] fs_asean_gold...")
try: try:
if 'year_range' in self.df_clean.columns: if 'year_range' in self.df_clean.columns:
@@ -229,7 +229,7 @@ class DimensionalModelLoader:
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" dim_time: {rows_loaded} rows\n") self.logger.info(f" dim_time: {rows_loaded} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -240,7 +240,7 @@ class DimensionalModelLoader:
def load_dim_country(self): def load_dim_country(self):
table_name = 'dim_country' table_name = 'dim_country'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_country -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading dim_country [DW/Gold] fs_asean_gold...")
try: try:
dim_country = self.df_clean[['country']].drop_duplicates().copy() dim_country = self.df_clean[['country']].drop_duplicates().copy()
@@ -270,9 +270,7 @@ class DimensionalModelLoader:
lambda x: region_mapping.get(x, ('Unknown', 'Unknown'))[1]) lambda x: region_mapping.get(x, ('Unknown', 'Unknown'))[1])
dim_country['iso_code'] = dim_country['country_name'].map(iso_mapping) dim_country['iso_code'] = dim_country['country_name'].map(iso_mapping)
dim_country_final = dim_country[ dim_country_final = dim_country[['country_name', 'region', 'subregion', 'iso_code']].copy()
['country_name', 'region', 'subregion', 'iso_code']
].copy()
dim_country_final = dim_country_final.reset_index(drop=True) dim_country_final = dim_country_final.reset_index(drop=True)
dim_country_final.insert(0, 'country_id', range(1, len(dim_country_final) + 1)) dim_country_final.insert(0, 'country_id', range(1, len(dim_country_final) + 1))
@@ -295,7 +293,7 @@ class DimensionalModelLoader:
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" dim_country: {rows_loaded} rows\n") self.logger.info(f" dim_country: {rows_loaded} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -304,19 +302,9 @@ class DimensionalModelLoader:
raise raise
def load_dim_indicator(self): def load_dim_indicator(self):
"""
Load dim_indicator ke Gold layer.
Kolom yang dimuat:
indicator_id — surrogate key
indicator_name — nama standar indikator
indicator_category — kategori (Health & Nutrition, dll.)
unit — satuan ukuran
direction — higher_better / lower_better
"""
table_name = 'dim_indicator' table_name = 'dim_indicator'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_indicator -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading dim_indicator [DW/Gold] fs_asean_gold...")
try: try:
has_direction = 'direction' in self.df_clean.columns has_direction = 'direction' in self.df_clean.columns
@@ -326,7 +314,6 @@ class DimensionalModelLoader:
dim_indicator = self.df_clean[['indicator_standardized']].drop_duplicates().copy() dim_indicator = self.df_clean[['indicator_standardized']].drop_duplicates().copy()
dim_indicator.columns = ['indicator_name'] dim_indicator.columns = ['indicator_name']
# Unit
if has_unit: if has_unit:
unit_map = self.df_clean[['indicator_standardized', 'unit']].drop_duplicates() unit_map = self.df_clean[['indicator_standardized', 'unit']].drop_duplicates()
unit_map.columns = ['indicator_name', 'unit'] unit_map.columns = ['indicator_name', 'unit']
@@ -334,7 +321,6 @@ class DimensionalModelLoader:
else: else:
dim_indicator['unit'] = None dim_indicator['unit'] = None
# Direction
if has_direction: if has_direction:
dir_map = self.df_clean[['indicator_standardized', 'direction']].drop_duplicates() dir_map = self.df_clean[['indicator_standardized', 'direction']].drop_duplicates()
dir_map.columns = ['indicator_name', 'direction'] dir_map.columns = ['indicator_name', 'direction']
@@ -344,43 +330,30 @@ class DimensionalModelLoader:
dim_indicator['direction'] = 'higher_better' dim_indicator['direction'] = 'higher_better'
self.logger.warning(" [WARN] direction not found, default: higher_better") self.logger.warning(" [WARN] direction not found, default: higher_better")
# Indicator category
if has_category: if has_category:
cat_map = self.df_clean[ cat_map = self.df_clean[['indicator_standardized', 'indicator_category']].drop_duplicates()
['indicator_standardized', 'indicator_category']
].drop_duplicates()
cat_map.columns = ['indicator_name', 'indicator_category'] cat_map.columns = ['indicator_name', 'indicator_category']
dim_indicator = dim_indicator.merge(cat_map, on='indicator_name', how='left') dim_indicator = dim_indicator.merge(cat_map, on='indicator_name', how='left')
else: else:
def categorize_indicator(name): def categorize_indicator(name):
n = str(name).lower() n = str(name).lower()
if any(w in n for w in [ if any(w in n for w in ['undernourishment', 'malnutrition', 'stunting',
'undernourishment', 'malnutrition', 'stunting', 'wasting', 'anemia', 'food security', 'food insecure', 'hunger']):
'wasting', 'anemia', 'anaemia', 'food security',
'food insecure', 'hunger'
]):
return 'Health & Nutrition' return 'Health & Nutrition'
elif any(w in n for w in [ elif any(w in n for w in ['production', 'yield', 'cereal', 'crop',
'production', 'yield', 'cereal', 'crop', 'import dependency', 'share of dietary']):
'import dependency', 'share of dietary'
]):
return 'Agricultural Production' return 'Agricultural Production'
elif any(w in n for w in ['import', 'export', 'trade']): elif any(w in n for w in ['import', 'export', 'trade']):
return 'Trade' return 'Trade'
elif any(w in n for w in ['gdp', 'income', 'economic']): elif any(w in n for w in ['gdp', 'income', 'economic']):
return 'Economic' return 'Economic'
elif any(w in n for w in [ elif any(w in n for w in ['water', 'sanitation', 'infrastructure', 'rail']):
'water', 'sanitation', 'infrastructure', 'rail'
]):
return 'Infrastructure' return 'Infrastructure'
else: else:
return 'Supporting' return 'Other'
dim_indicator['indicator_category'] = dim_indicator['indicator_name'].apply( dim_indicator['indicator_category'] = dim_indicator['indicator_name'].apply(categorize_indicator)
categorize_indicator
)
dim_indicator = dim_indicator.drop_duplicates(subset=['indicator_name'], keep='first') dim_indicator = dim_indicator.drop_duplicates(subset=['indicator_name'], keep='first')
dim_indicator_final = dim_indicator[ dim_indicator_final = dim_indicator[
['indicator_name', 'indicator_category', 'unit', 'direction'] ['indicator_name', 'indicator_category', 'unit', 'direction']
].copy() ].copy()
@@ -401,22 +374,17 @@ class DimensionalModelLoader:
) )
self._add_primary_key(table_name, 'indicator_id') self._add_primary_key(table_name, 'indicator_id')
# Log distribusi for label, col in [('Categories', 'indicator_category'), ('Direction', 'direction')]:
for label, col in [
('Categories', 'indicator_category'),
('Direction', 'direction'),
]:
self.logger.info(f" {label}:") self.logger.info(f" {label}:")
for val, cnt in dim_indicator_final[col].value_counts().items(): for val, cnt in dim_indicator_final[col].value_counts().items():
pct = cnt / len(dim_indicator_final) * 100 self.logger.info(f" - {val}: {cnt} ({cnt/len(dim_indicator_final)*100:.1f}%)")
self.logger.info(f" - {val}: {cnt} ({pct:.1f}%)")
self.load_metadata[table_name].update( self.load_metadata[table_name].update(
{'rows_loaded': rows_loaded, 'status': 'success', 'end_time': datetime.now()} {'rows_loaded': rows_loaded, 'status': 'success', 'end_time': datetime.now()}
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" dim_indicator: {rows_loaded} rows\n") self.logger.info(f" dim_indicator: {rows_loaded} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -427,7 +395,7 @@ class DimensionalModelLoader:
def load_dim_source(self): def load_dim_source(self):
table_name = 'dim_source' table_name = 'dim_source'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_source -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading dim_source [DW/Gold] fs_asean_gold...")
try: try:
source_details = { source_details = {
@@ -487,7 +455,7 @@ class DimensionalModelLoader:
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" dim_source: {rows_loaded} rows\n") self.logger.info(f" dim_source: {rows_loaded} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -498,15 +466,15 @@ class DimensionalModelLoader:
def load_dim_pillar(self): def load_dim_pillar(self):
table_name = 'dim_pillar' table_name = 'dim_pillar'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading dim_pillar -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading dim_pillar [DW/Gold] fs_asean_gold...")
try: try:
pillar_codes = { pillar_codes = {
'Availability': 'AVL', 'Access' : 'ACC', 'Availability': 'AVL', 'Access' : 'ACC',
'Utilization' : 'UTL', 'Stability': 'STB', 'Supporting': 'SPT', 'Utilization' : 'UTL', 'Stability': 'STB', 'Other': 'OTH',
} }
pillars_data = [ pillars_data = [
{'pillar_name': p, 'pillar_code': pillar_codes.get(p, 'SPT')} {'pillar_name': p, 'pillar_code': pillar_codes.get(p, 'OTH')}
for p in self.df_clean['pillar'].unique() for p in self.df_clean['pillar'].unique()
] ]
@@ -533,7 +501,7 @@ class DimensionalModelLoader:
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" dim_pillar: {rows_loaded} rows\n") self.logger.info(f" dim_pillar: {rows_loaded} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -548,9 +516,10 @@ class DimensionalModelLoader:
def load_fact_food_security(self): def load_fact_food_security(self):
table_name = 'fact_food_security' table_name = 'fact_food_security'
self.load_metadata[table_name]['start_time'] = datetime.now() self.load_metadata[table_name]['start_time'] = datetime.now()
self.logger.info("Loading fact_food_security -> [DW/Gold] fs_asean_gold...") self.logger.info("Loading fact_food_security [DW/Gold] fs_asean_gold...")
try: try:
# Load dims dari Gold untuk FK resolution
dim_country = read_from_bigquery(self.client, 'dim_country', layer='gold') dim_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
dim_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold') dim_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
dim_time = read_from_bigquery(self.client, 'dim_time', layer='gold') dim_time = read_from_bigquery(self.client, 'dim_time', layer='gold')
@@ -592,9 +561,9 @@ class DimensionalModelLoader:
fact_table['start_year'] = fact_table['year'].astype(int) fact_table['start_year'] = fact_table['year'].astype(int)
fact_table['end_year'] = fact_table['year'].astype(int) fact_table['end_year'] = fact_table['year'].astype(int)
# Resolve FKs
fact_table = fact_table.merge( fact_table = fact_table.merge(
dim_country[['country_id', 'country_name']].rename( dim_country[['country_id', 'country_name']].rename(columns={'country_name': 'country'}),
columns={'country_name': 'country'}),
on='country', how='left' on='country', how='left'
) )
fact_table = fact_table.merge( fact_table = fact_table.merge(
@@ -607,16 +576,15 @@ class DimensionalModelLoader:
on=['start_year', 'end_year'], how='left' on=['start_year', 'end_year'], how='left'
) )
fact_table = fact_table.merge( fact_table = fact_table.merge(
dim_source[['source_id', 'source_name']].rename( dim_source[['source_id', 'source_name']].rename(columns={'source_name': 'source'}),
columns={'source_name': 'source'}),
on='source', how='left' on='source', how='left'
) )
fact_table = fact_table.merge( fact_table = fact_table.merge(
dim_pillar[['pillar_id', 'pillar_name']].rename( dim_pillar[['pillar_id', 'pillar_name']].rename(columns={'pillar_name': 'pillar'}),
columns={'pillar_name': 'pillar'}),
on='pillar', how='left' on='pillar', how='left'
) )
# Filter hanya row dengan FK lengkap
fact_table = fact_table[ fact_table = fact_table[
fact_table['country_id'].notna() & fact_table['country_id'].notna() &
fact_table['indicator_id'].notna() & fact_table['indicator_id'].notna() &
@@ -653,6 +621,7 @@ class DimensionalModelLoader:
layer='gold', write_disposition="WRITE_TRUNCATE", schema=schema layer='gold', write_disposition="WRITE_TRUNCATE", schema=schema
) )
# Add PK + FKs
self._add_primary_key(table_name, 'fact_id') self._add_primary_key(table_name, 'fact_id')
self._add_foreign_key(table_name, 'country_id', 'dim_country', 'country_id') self._add_foreign_key(table_name, 'country_id', 'dim_country', 'country_id')
self._add_foreign_key(table_name, 'indicator_id', 'dim_indicator', 'indicator_id') self._add_foreign_key(table_name, 'indicator_id', 'dim_indicator', 'indicator_id')
@@ -665,7 +634,7 @@ class DimensionalModelLoader:
) )
log_update(self.client, 'DW', table_name, 'full_load', rows_loaded) log_update(self.client, 'DW', table_name, 'full_load', rows_loaded)
self._save_table_metadata(table_name) self._save_table_metadata(table_name)
self.logger.info(f" fact_food_security: {rows_loaded:,} rows\n") self.logger.info(f" fact_food_security: {rows_loaded:,} rows\n")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -748,15 +717,11 @@ class DimensionalModelLoader:
FROM `{get_table_id('dim_indicator', layer='gold')}` FROM `{get_table_id('dim_indicator', layer='gold')}`
GROUP BY direction ORDER BY direction GROUP BY direction ORDER BY direction
""" """
df_dir = self.client.query(query_dir).result().to_dataframe( df_dir = self.client.query(query_dir).result().to_dataframe(create_bqstorage_client=False)
create_bqstorage_client=False
)
if len(df_dir) > 0: if len(df_dir) > 0:
self.logger.info(f"\n Direction Distribution:") self.logger.info(f"\n Direction Distribution:")
for _, row in df_dir.iterrows(): for _, row in df_dir.iterrows():
self.logger.info( self.logger.info(f" {row['direction']:15s}: {int(row['count']):>5,} indicators")
f" {row['direction']:15s}: {int(row['count']):>5,} indicators"
)
self.logger.info("\n [OK] Validation completed") self.logger.info("\n [OK] Validation completed")
except Exception as e: except Exception as e:
@@ -773,19 +738,22 @@ class DimensionalModelLoader:
self.pipeline_metadata['rows_fetched'] = len(self.df_clean) self.pipeline_metadata['rows_fetched'] = len(self.df_clean)
self.logger.info("\n" + "=" * 60) self.logger.info("\n" + "=" * 60)
self.logger.info("DIMENSIONAL MODEL LOAD — DW (Gold) -> fs_asean_gold") self.logger.info("DIMENSIONAL MODEL LOAD — DW (Gold) fs_asean_gold")
self.logger.info("=" * 60) self.logger.info("=" * 60)
self.logger.info("\nLOADING DIMENSION TABLES -> fs_asean_gold") # Dimensions
self.logger.info("\nLOADING DIMENSION TABLES → fs_asean_gold")
self.load_dim_country() self.load_dim_country()
self.load_dim_indicator() self.load_dim_indicator()
self.load_dim_time() self.load_dim_time()
self.load_dim_source() self.load_dim_source()
self.load_dim_pillar() self.load_dim_pillar()
self.logger.info("\nLOADING FACT TABLE -> fs_asean_gold") # Fact
self.logger.info("\nLOADING FACT TABLE → fs_asean_gold")
self.load_fact_food_security() self.load_fact_food_security()
# Validate
self.validate_constraints() self.validate_constraints()
self.validate_data_load() self.validate_data_load()
@@ -801,9 +769,7 @@ class DimensionalModelLoader:
'execution_timestamp': self.pipeline_metadata['start_time'], 'execution_timestamp': self.pipeline_metadata['start_time'],
'completeness_pct' : 100.0, 'completeness_pct' : 100.0,
'config_snapshot' : json.dumps({'load_mode': 'full_refresh', 'layer': 'gold'}), 'config_snapshot' : json.dumps({'load_mode': 'full_refresh', 'layer': 'gold'}),
'validation_metrics' : json.dumps( 'validation_metrics': json.dumps({t: m['status'] for t, m in self.load_metadata.items()}),
{t: m['status'] for t, m in self.load_metadata.items()}
),
'table_name' : 'dimensional_model_pipeline', 'table_name' : 'dimensional_model_pipeline',
}) })
try: try:
@@ -811,6 +777,7 @@ class DimensionalModelLoader:
except Exception as e: except Exception as e:
self.logger.warning(f" [WARN] Could not save pipeline metadata: {e}") self.logger.warning(f" [WARN] Could not save pipeline metadata: {e}")
# Summary
self.logger.info("\n" + "=" * 60) self.logger.info("\n" + "=" * 60)
self.logger.info("DIMENSIONAL MODEL LOAD COMPLETED") self.logger.info("DIMENSIONAL MODEL LOAD COMPLETED")
self.logger.info("=" * 60) self.logger.info("=" * 60)
@@ -818,19 +785,20 @@ class DimensionalModelLoader:
self.logger.info(f" Duration : {duration:.2f}s") self.logger.info(f" Duration : {duration:.2f}s")
self.logger.info(f" Tables :") self.logger.info(f" Tables :")
for tbl, meta in self.load_metadata.items(): for tbl, meta in self.load_metadata.items():
icon = "OK" if meta['status'] == 'success' else "FAIL" icon = "" if meta['status'] == 'success' else ""
self.logger.info(f" [{icon}] {tbl:25s}: {meta['rows_loaded']:>10,} rows") self.logger.info(f" {icon} {tbl:25s}: {meta['rows_loaded']:>10,} rows")
self.logger.info(f"\n Metadata -> [AUDIT] etl_metadata") self.logger.info(f"\n Metadata [AUDIT] etl_metadata")
self.logger.info("=" * 60) self.logger.info("=" * 60)
# ============================================================================= # =============================================================================
# AIRFLOW TASK FUNCTIONS # AIRFLOW TASK FUNCTIONS ← sama polanya dengan raw & cleaned layer
# ============================================================================= # =============================================================================
def run_dimensional_model(): def run_dimensional_model():
""" """
Airflow task: Load dimensional model dari cleaned_integrated. Airflow task: Load dimensional model dari cleaned_integrated.
Dipanggil oleh DAG setelah task cleaned_integration_to_silver selesai. Dipanggil oleh DAG setelah task cleaned_integration_to_silver selesai.
""" """
from scripts.bigquery_config import get_bigquery_client from scripts.bigquery_config import get_bigquery_client
@@ -849,9 +817,9 @@ if __name__ == "__main__":
print("=" * 60) print("=" * 60)
print("BIGQUERY DIMENSIONAL MODEL LOAD") print("BIGQUERY DIMENSIONAL MODEL LOAD")
print("Kimball DW Architecture") print("Kimball DW Architecture")
print(" Input : STAGING (Silver) -> cleaned_integrated (fs_asean_silver)") print(" Input : STAGING (Silver) cleaned_integrated (fs_asean_silver)")
print(" Output : DW (Gold) -> dim_*, fact_* (fs_asean_gold)") print(" Output : DW (Gold) dim_*, fact_* (fs_asean_gold)")
print(" Audit : AUDIT -> etl_logs, etl_metadata (fs_asean_audit)") print(" Audit : AUDIT etl_logs, etl_metadata (fs_asean_audit)")
print("=" * 60) print("=" * 60)
logger = setup_logging() logger = setup_logging()
@@ -859,22 +827,24 @@ if __name__ == "__main__":
print("\nLoading cleaned_integrated (fs_asean_silver)...") print("\nLoading cleaned_integrated (fs_asean_silver)...")
df_clean = read_from_bigquery(client, 'cleaned_integrated', layer='silver') df_clean = read_from_bigquery(client, 'cleaned_integrated', layer='silver')
print(f" Loaded : {len(df_clean):,} rows") print(f" Loaded : {len(df_clean):,} rows")
print(f" Columns : {len(df_clean.columns)}") print(f" Columns : {len(df_clean.columns)}")
print(f" Sources : {df_clean['source'].nunique()}") print(f" Sources : {df_clean['source'].nunique()}")
print(f" Indicators : {df_clean['indicator_standardized'].nunique()}") print(f" Indicators : {df_clean['indicator_standardized'].nunique()}")
print(f" Countries : {df_clean['country'].nunique()}") print(f" Countries : {df_clean['country'].nunique()}")
print(f" Year range : {int(df_clean['year'].min())}-{int(df_clean['year'].max())}") print(f" Year range : {int(df_clean['year'].min())}{int(df_clean['year'].max())}")
if 'direction' in df_clean.columns: if 'direction' in df_clean.columns:
print(f" Direction : {df_clean['direction'].value_counts().to_dict()}") print(f" Direction : {df_clean['direction'].value_counts().to_dict()}")
else:
print(f" [WARN] direction column not found — run bigquery_cleaned_layer.py first")
print("\n[1/1] Dimensional Model Load -> DW (Gold)...") print("\n[1/1] Dimensional Model Load DW (Gold)...")
loader = DimensionalModelLoader(client, df_clean) loader = DimensionalModelLoader(client, df_clean)
loader.run() loader.run()
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("[OK] DIMENSIONAL MODEL ETL COMPLETED") print(" DIMENSIONAL MODEL ETL COMPLETED")
print(" DW (Gold) : dim_country, dim_indicator, dim_time,") print(" 🥇 DW (Gold) : dim_country, dim_indicator, dim_time,")
print(" dim_source, dim_pillar, fact_food_security") print(" dim_source, dim_pillar, fact_food_security")
print(" AUDIT : etl_logs, etl_metadata") print(" 📋 AUDIT : etl_logs, etl_metadata")
print("=" * 60) print("=" * 60)