sdgs year v2

This commit is contained in:
Debby
2026-03-31 23:38:15 +07:00
parent 0d89c60b12
commit 8ae5018a62

View File

@@ -8,7 +8,7 @@ Filtering Order:
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps) 3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
4. Filter countries with ALL pillars (FIXED SET) 4. Filter countries with ALL pillars (FIXED SET)
5. Filter indicators with consistent presence across FIXED countries 5. Filter indicators with consistent presence across FIXED countries
6. Determine SDG start year & assign framework (MDGs/SDGs) per ROW per year 6. Determine SDG start year & assign framework (MDGs/SDGs) per indicator PER ROW
7. Verify no gaps 7. Verify no gaps
8. Calculate norm_value_1_100 per indicator per country (min-max, direction-aware) 8. Calculate norm_value_1_100 per indicator per country (min-max, direction-aware)
9. Calculate YoY per indicator per country 9. Calculate YoY per indicator per country
@@ -22,18 +22,17 @@ NORMALISASI (Step 8):
sehingga nilai antar negara dan antar tahun tetap comparable sehingga nilai antar negara dan antar tahun tetap comparable
- Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio - Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio
FRAMEWORK LOGIC (Per-Row, bukan per indikator): FRAMEWORK LOGIC (FIX - Row-Level Assignment):
- sdg_start_year dideteksi dari data: tahun pertama indikator FIES lengkap - SDG start year dideteksi dari data: tahun pertama indikator FIES/anaemia lengkap
di semua fixed countries (setelah Step 3-5 filter selesai) di semua fixed countries (setelah Step 3-5 filter selesai)
- Proxy deteksi sdg_start_year: HANYA FIES ("food insecurity", "food insecure") - Framework di-assign PER BARIS (per tahun), bukan per indikator:
Anemia TIDAK dipakai sebagai proxy karena datanya sudah ada sebelum era SDGs * Jika row['year'] < sdg_start_year -> selalu 'MDGs'
- Framework di-assign PER BARIS (per year), bukan per indikator: * Jika row['year'] >= sdg_start_year DAN
* row['year'] >= sdg_start_year AND nama ada di SDG_INDICATOR_KEYWORDS -> 'SDGs' nama ada di SDG_INDICATOR_KEYWORDS -> 'SDGs'
* Selain itu -> 'MDGs' * Selain itu -> 'MDGs'
- Ini menangani indikator "shared" (anemia, stunting, wasting, undernourishment) - Dengan demikian, indikator seperti "Prevalence of anemia" yang datanya dimulai
yang datanya ada sebelum SDGs: sebelum era SDGs akan berlabel 'MDGs' untuk tahun-tahun pra-SDGs dan 'SDGs'
* row lama (year < sdg_start_year) -> 'MDGs' untuk tahun-tahun pasca (>= sdg_start_year).
* row baru (year >= sdg_start_year) -> 'SDGs'
""" """
import pandas as pd import pandas as pd
@@ -61,16 +60,13 @@ from google.cloud import bigquery
# ============================================================================= # =============================================================================
# SDG INDICATOR KEYWORDS # SDG INDICATOR KEYWORDS
# Daftar nama indikator (lowercase) yang masuk SDG framework.
# Indikator ini akan di-assign 'SDGs' untuk baris dengan year >= sdg_start_year,
# dan 'MDGs' untuk baris dengan year < sdg_start_year.
# ============================================================================= # =============================================================================
SDG_INDICATOR_KEYWORDS = frozenset([ SDG_INDICATOR_KEYWORDS = frozenset([
# TARGET 2.1.1 — Prevalence of undernourishment (shared: ada sebelum SDGs) # TARGET 2.1.1 — Prevalence of undernourishment (shared, sudah ada sebelum SDGs)
"prevalence of undernourishment (percent) (3-year average)", "prevalence of undernourishment (percent) (3-year average)",
"number of people undernourished (million) (3-year average)", "number of people undernourished (million) (3-year average)",
# TARGET 2.1.2 — FIES (SDGs only — murni baru di era SDGs) # TARGET 2.1.2 — FIES (SDGs only)
"prevalence of severe food insecurity in the total population (percent) (3-year average)", "prevalence of severe food insecurity in the total population (percent) (3-year average)",
"prevalence of severe food insecurity in the male adult population (percent) (3-year average)", "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
"prevalence of severe food insecurity in the female adult population (percent) (3-year average)", "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
@@ -83,35 +79,24 @@ SDG_INDICATOR_KEYWORDS = frozenset([
"number of moderately or severely food insecure people (million) (3-year average)", "number of moderately or severely food insecure people (million) (3-year average)",
"number of moderately or severely food insecure male adults (million) (3-year average)", "number of moderately or severely food insecure male adults (million) (3-year average)",
"number of moderately or severely food insecure female adults (million) (3-year average)", "number of moderately or severely food insecure female adults (million) (3-year average)",
# TARGET 2.2.1 — Stunting (shared: ada sebelum SDGs) # TARGET 2.2.1 — Stunting (shared)
"percentage of children under 5 years of age who are stunted (modelled estimates) (percent)", "percentage of children under 5 years of age who are stunted (modelled estimates) (percent)",
"number of children under 5 years of age who are stunted (modeled estimates) (million)", "number of children under 5 years of age who are stunted (modeled estimates) (million)",
# TARGET 2.2.2 — Wasting & Overweight (shared: ada sebelum SDGs) # TARGET 2.2.2 — Wasting & Overweight (shared)
"percentage of children under 5 years affected by wasting (percent)", "percentage of children under 5 years affected by wasting (percent)",
"number of children under 5 years affected by wasting (million)", "number of children under 5 years affected by wasting (million)",
"percentage of children under 5 years of age who are overweight (modelled estimates) (percent)", "percentage of children under 5 years of age who are overweight (modelled estimates) (percent)",
"number of children under 5 years of age who are overweight (modeled estimates) (million)", "number of children under 5 years of age who are overweight (modeled estimates) (million)",
# TARGET 2.2.3 — Anaemia (shared: data ada sebelum SDGs, listed here agar # TARGET 2.2.3 — Anaemia (SDGs only — listed here so rows >= sdg_start_year become SDGs)
# baris >= sdg_start_year di-assign 'SDGs')
"prevalence of anemia among women of reproductive age (15-49 years) (percent)", "prevalence of anemia among women of reproductive age (15-49 years) (percent)",
"number of women of reproductive age (15-49 years) affected by anemia (million)", "number of women of reproductive age (15-49 years) affected by anemia (million)",
]) ])
# ============================================================================= # Proxy keywords untuk deteksi era SDGs dari data (indikator murni baru di SDGs)
# SDG ERA PROXY KEYWORDS
# HANYA indikator yang MURNI baru di era SDGs (FIES saja).
# Dipakai untuk mendeteksi sdg_start_year dari data.
#
# PENTING — Anemia/anaemia TIDAK dipakai sebagai proxy:
# Data anemia sudah ada sebelum era SDGs sehingga actual_start_year-nya
# lebih awal dari sdg_start_year. Jika dipakai sebagai proxy, sdg_start_year
# akan terdeteksi terlalu awal dan seluruh baris anemia akan menjadi 'SDGs'.
# FIES adalah satu-satunya indikator yang benar-benar murni baru di era SDGs
# dan dapat dipakai sebagai penanda tahun mulainya era SDGs.
# =============================================================================
_SDG_ERA_PROXY_KEYWORDS = frozenset([ _SDG_ERA_PROXY_KEYWORDS = frozenset([
"food insecurity", "food insecurity",
"food insecure", "anemia",
"anaemia",
]) ])
# ============================================================================= # =============================================================================
@@ -119,8 +104,8 @@ _SDG_ERA_PROXY_KEYWORDS = frozenset([
# ============================================================================= # =============================================================================
# Digunakan untuk assign kondisi di analysis_layer. # Digunakan untuk assign kondisi di analysis_layer.
# Didefinisikan di sini agar konsisten antara kedua file. # Didefinisikan di sini agar konsisten antara kedua file.
# bad : norm_value_1_100 < THRESHOLD_BAD # bad : norm_value_1_100 < THRESHOLD_BAD
# good : norm_value_1_100 > THRESHOLD_GOOD # good : norm_value_1_100 > THRESHOLD_GOOD
# moderate : di antara keduanya # moderate : di antara keduanya
THRESHOLD_BAD = 40.0 THRESHOLD_BAD = 40.0
@@ -143,41 +128,33 @@ def assign_condition(norm_value_1_100: float) -> str:
return 'moderate' return 'moderate'
def assign_framework_for_row(
    indicator_name: str,
    row_year: int,
    sdg_start_year: int,
) -> str:
    """
    Return the framework label ('MDGs' or 'SDGs') for a single data row.

    The label is decided per row (per year), not once per indicator, so the
    same indicator can carry 'MDGs' on its early rows and 'SDGs' on its
    later rows.

    Args:
        indicator_name: Indicator name. It is converted to str, lower-cased
            and stripped before being looked up in SDG_INDICATOR_KEYWORDS.
        row_year: Year of the row being labelled.
        sdg_start_year: First year treated as part of the SDG era.

    Returns:
        'MDGs' when row_year < sdg_start_year, whatever the indicator name.
        Otherwise 'SDGs' when the normalised name is a member of
        SDG_INDICATOR_KEYWORDS, and 'MDGs' when it is not.
    """
    # Rows before the SDG era are MDGs regardless of the indicator name.
    if row_year < sdg_start_year:
        return 'MDGs'

    # From sdg_start_year onwards only names on the SDG list become SDGs.
    normalised_name = str(indicator_name).lower().strip()
    return 'SDGs' if normalised_name in SDG_INDICATOR_KEYWORDS else 'MDGs'
@@ -197,11 +174,10 @@ class AnalyticalLayerLoader:
norm_value_1_100, <- min-max norm per indikator, skala 1-100, direction-aware norm_value_1_100, <- min-max norm per indikator, skala 1-100, direction-aware
yoy_change, yoy_pct yoy_change, yoy_pct
Catatan framework: PERUBAHAN (framework fix):
Framework di-assign PER BARIS (per year), sehingga indikator shared - framework di-assign per baris (per tahun), bukan per indikator.
seperti anemia dapat memiliki framework berbeda di baris yang berbeda: - Baris dengan year < sdg_start_year selalu 'MDGs'.
- baris sebelum sdg_start_year -> 'MDGs' - Baris dengan year >= sdg_start_year dan nama di SDG_INDICATOR_KEYWORDS → 'SDGs'.
- baris sejak sdg_start_year -> 'SDGs'
""" """
def __init__(self, client: bigquery.Client): def __init__(self, client: bigquery.Client):
@@ -306,14 +282,6 @@ class AnalyticalLayerLoader:
self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES") self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES")
self.logger.info("=" * 80) self.logger.info("=" * 80)
# Filter single years only (is_year_range == False)
if 'is_year_range' in self.df_clean.columns:
before = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['is_year_range'] == False].copy()
self.logger.info(
f" Filter single years only: {before:,} -> {len(self.df_clean):,} rows"
)
# baseline_year = 2023 hardcode (syarat dosen: minimal 2023) # baseline_year = 2023 hardcode (syarat dosen: minimal 2023)
df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year] df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_baseline['indicator_id'].nunique() baseline_indicator_count = df_baseline['indicator_id'].nunique()
@@ -529,11 +497,6 @@ class AnalyticalLayerLoader:
self.logger.info(f"\n [+] Valid: {len(valid_indicators)}") self.logger.info(f"\n [+] Valid: {len(valid_indicators)}")
self.logger.info(f" [-] Removed: {len(removed_indicators)}") self.logger.info(f" [-] Removed: {len(removed_indicators)}")
if removed_indicators:
self.logger.info(f"\n Removed indicators:")
for item in removed_indicators:
self.logger.info(f" [-] {item['indicator_name'][:60]} | {item['reason']}")
if not valid_indicators: if not valid_indicators:
raise ValueError("No valid indicators found after filtering!") raise ValueError("No valid indicators found after filtering!")
@@ -559,18 +522,13 @@ class AnalyticalLayerLoader:
return self.df_clean return self.df_clean
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK PER ROW # STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK (ROW-LEVEL FIX)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def determine_sdg_start_year(self): def determine_sdg_start_year(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK PER ROW") self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK (ROW-LEVEL)")
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.logger.info(
" Proxy: FIES only (food insecurity/food insecure).\n"
" Anemia TIDAK dipakai sebagai proxy — datanya ada sebelum era SDGs.\n"
" Framework di-assign PER BARIS (year), bukan per indikator."
)
# actual_start_year per indikator = max(min_year per country) # actual_start_year per indikator = max(min_year per country)
# = konsisten dengan max_start_year di Step 5 # = konsisten dengan max_start_year di Step 5
@@ -583,7 +541,7 @@ class AnalyticalLayerLoader:
) )
indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year'] indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year']
# Deteksi sdg_start_year dari proxy SDGs-only (FIES saja, BUKAN anemia) # Deteksi sdg_start_year dari proxy SDGs-only (FIES & anaemia)
proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply( proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply(
lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS) lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS)
) )
@@ -591,78 +549,71 @@ class AnalyticalLayerLoader:
if df_proxy.empty: if df_proxy.empty:
raise ValueError( raise ValueError(
"Tidak ada indikator proxy SDGs (FIES) yang lolos filter. " "Tidak ada indikator proxy SDGs (FIES/anaemia) yang lolos filter. "
"Pastikan indikator FIES (food insecurity/food insecure) ada di data." "Pastikan indikator FIES dan anaemia ada di data."
) )
self.sdg_start_year = int(df_proxy['actual_start_year'].min()) self.sdg_start_year = int(df_proxy['actual_start_year'].min())
self.logger.info(f"\n sdg_start_year = {self.sdg_start_year}") self.logger.info(f"\n sdg_start_year = {self.sdg_start_year}")
self.logger.info(f" Proxy indicators (FIES only):") self.logger.info(f" Proxy indicators (penentu sdg_start_year):")
for _, row in df_proxy.iterrows(): for _, row in df_proxy.iterrows():
self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}") self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}")
# ---------------------------------------------------------------- # ------------------------------------------------------------------
# Assign framework PER BARIS menggunakan year baris, bukan actual_start_year # FIX: Assign framework PER BARIS (per tahun), bukan per indikator
# Sehingga indikator "shared" (anemia, stunting, dll) mendapat: # ------------------------------------------------------------------
# - 'MDGs' untuk baris sebelum sdg_start_year # Logic:
# - 'SDGs' untuk baris sejak sdg_start_year # row['year'] < sdg_start_year → 'MDGs' (apapun nama indikatornya)
# ---------------------------------------------------------------- # row['year'] >= sdg_start_year + nama di SDG_INDICATOR_KEYWORDS → 'SDGs'
# selain itu → 'MDGs'
# ------------------------------------------------------------------
self.logger.info(f"\n Assigning framework PER ROW (year-level)...")
self.logger.info(f" Rule: year < {self.sdg_start_year} → MDGs (always)")
self.logger.info(f" Rule: year >= {self.sdg_start_year} + name in SDG list → SDGs")
self.logger.info(f" Rule: year >= {self.sdg_start_year} + name NOT in SDG list → MDGs")
self.df_clean['framework'] = self.df_clean.apply( self.df_clean['framework'] = self.df_clean.apply(
lambda row: assign_framework_per_row( lambda row: assign_framework_for_row(
indicator_name = row['indicator_name'], indicator_name = row['indicator_name'],
year = int(row['year']), row_year = int(row['year']),
sdg_start_year = self.sdg_start_year, sdg_start_year = self.sdg_start_year,
), ),
axis=1 axis=1
) )
# ---------------------------------------------------------------- # Log ringkasan per indikator untuk verifikasi
# Logging: ringkasan per indikator (frameworks apa yang muncul) self.logger.info(f"\n {'Framework Assignment per Indicator (sample)':}")
# ---------------------------------------------------------------- self.logger.info(f" {'-'*95}")
ind_fw_summary = ( self.logger.info(
self.df_clean f" {'ID':<5} {'Indicator Name':<50} "
.groupby(['indicator_id', 'indicator_name'])['framework'] f"{'Pre-SDG rows':<15} {'MDGs rows':<12} {'SDGs rows'}"
.unique()
.reset_index()
)
ind_fw_summary['frameworks'] = ind_fw_summary['framework'].apply(
lambda x: '/'.join(sorted(x))
)
ind_fw_summary = ind_fw_summary.merge(
indicator_actual_start[['indicator_id', 'actual_start_year']],
on='indicator_id', how='left'
) )
self.logger.info(f" {'-'*95}")
self.logger.info(f"\n Framework assignment per indikator:") for ind_id, grp in self.df_clean.groupby('indicator_id'):
self.logger.info(f" {'-'*85}") ind_name = grp['indicator_name'].iloc[0]
self.logger.info(f" {'ID':<5} {'Frameworks':<18} {'ActualStart':<13} {'Indicator Name'}") pre_sdg = (grp['year'] < self.sdg_start_year).sum()
self.logger.info(f" {'-'*85}") mdgs_rows = (grp['framework'] == 'MDGs').sum()
for _, row in ind_fw_summary.sort_values( sdgs_rows = (grp['framework'] == 'SDGs').sum()
['frameworks', 'actual_start_year', 'indicator_name']
).iterrows():
self.logger.info( self.logger.info(
f" {int(row['indicator_id']):<5} {row['frameworks']:<18} " f" {int(ind_id):<5} {ind_name[:48]:<50} "
f"{int(row['actual_start_year']):<13} {row['indicator_name'][:48]}" f"{pre_sdg:<15} {mdgs_rows:<12} {sdgs_rows}"
) )
# Indikator dengan framework split (MDGs/SDGs) — highlight untuk validasi
split_inds = ind_fw_summary[ind_fw_summary['frameworks'] == 'MDGs/SDGs']
if not split_inds.empty:
self.logger.info(
f"\n [INFO] {len(split_inds)} indikator memiliki framework split "
f"(MDGs sebelum {self.sdg_start_year}, SDGs sejak {self.sdg_start_year}):"
)
for _, row in split_inds.iterrows():
self.logger.info(f" - {row['indicator_name'][:60]}")
fw_summary = self.df_clean['framework'].value_counts() fw_summary = self.df_clean['framework'].value_counts()
self.logger.info( self.logger.info(f"\n Ringkasan rows: " + " | ".join(
f"\n Ringkasan rows: " + f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items()
" | ".join(f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items()) ))
)
# Ringkasan unique indicators per framework di tahun terbaru (end_year)
end_year_df = self.df_clean[self.df_clean['year'] == self.end_year]
fw_ind_summary = end_year_df.groupby('framework')['indicator_id'].nunique()
self.logger.info(f" Indicators di year={self.end_year}: " + " | ".join(
f"{fw}: {cnt}" for fw, cnt in fw_ind_summary.items()
))
self.logger.info( self.logger.info(
f"\n [OK] 'framework' ditambahkan per row" f"\n [OK] 'framework' ditambahkan (row-level)"
f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | " f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | "
f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows" f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows"
) )
@@ -700,7 +651,7 @@ class AnalyticalLayerLoader:
return True return True
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR # STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR PER COUNTRY
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def calculate_norm_value(self): def calculate_norm_value(self):
@@ -731,7 +682,7 @@ class AnalyticalLayerLoader:
"negative", "lower_better", "lower_is_better", "inverse", "neg", "negative", "lower_better", "lower_is_better", "inverse", "neg",
}) })
df = self.df_clean.copy() df = self.df_clean.copy()
norm_parts = [] norm_parts = []
indicators = df.groupby(['indicator_id', 'indicator_name', 'direction']) indicators = df.groupby(['indicator_id', 'indicator_name', 'direction'])
@@ -862,45 +813,40 @@ class AnalyticalLayerLoader:
'start_year', 'end_year', 'country_count' 'start_year', 'end_year', 'country_count'
] ]
# Framework summary per indikator (bisa MDGs, SDGs, atau MDGs/SDGs split) # Framework per indikator di end_year (untuk display — representasi terbaru)
ind_fw = ( fw_at_end = (
self.df_clean self.df_clean[self.df_clean['year'] == self.end_year]
.groupby('indicator_id')['framework'] .groupby('indicator_id')['framework']
.unique() .first()
.reset_index() .reset_index()
) )
ind_fw['framework_label'] = ind_fw['framework'].apply( indicator_details = indicator_details.merge(fw_at_end, on='indicator_id', how='left')
lambda x: '/'.join(sorted(x)) indicator_details['framework'] = indicator_details['framework'].fillna('MDGs')
)
indicator_details = indicator_details.merge(
ind_fw[['indicator_id', 'framework_label']],
on='indicator_id', how='left'
)
indicator_details['year_range'] = ( indicator_details['year_range'] = (
indicator_details['start_year'].astype(int).astype(str) + '-' + indicator_details['start_year'].astype(int).astype(str) + '-' +
indicator_details['end_year'].astype(int).astype(str) indicator_details['end_year'].astype(int).astype(str)
) )
indicator_details = indicator_details.sort_values( indicator_details = indicator_details.sort_values(
['framework_label', 'pillar_name', 'start_year', 'indicator_name'] ['framework', 'pillar_name', 'start_year', 'indicator_name']
) )
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}") self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
self.logger.info(f"Framework breakdown (per indicator label):") self.logger.info(f"Framework breakdown (at end_year={self.end_year}):")
for fw, count in indicator_details.groupby('framework_label').size().items(): for fw, count in indicator_details.groupby('framework').size().items():
self.logger.info(f" {fw}: {count} indicators") self.logger.info(f" {fw}: {count} indicators")
self.logger.info(f"\n{'-'*115}") self.logger.info(f"\n{'-'*110}")
self.logger.info( self.logger.info(
f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} " f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} "
f"{'Framework':<15} {'Years':<12} {'Dir':<8} {'Countries'}" f"{'Framework':<10} {'Years':<12} {'Dir':<8} {'Countries'}"
) )
self.logger.info(f"{'-'*115}") self.logger.info(f"{'-'*110}")
for _, row in indicator_details.iterrows(): for _, row in indicator_details.iterrows():
direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-' direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-'
self.logger.info( self.logger.info(
f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} " f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} "
f"{row['pillar_name'][:13]:<15} {row['framework_label']:<15} " f"{row['pillar_name'][:13]:<15} {row['framework']:<10} "
f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}" f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}"
) )
@@ -969,16 +915,14 @@ class AnalyticalLayerLoader:
for fw, cnt in fw_dist_rows.items(): for fw, cnt in fw_dist_rows.items():
self.logger.info(f" {fw}: {cnt:,} rows") self.logger.info(f" {fw}: {cnt:,} rows")
# Framework distribution per indikator (label) # Framework distribution per unique indicator (at end_year)
ind_fw_label = ( fw_dist_ind = (
analytical_df analytical_df[analytical_df['year'] == self.end_year]
.groupby('indicator_id')['framework'] .drop_duplicates('indicator_id')['framework']
.unique()
.apply(lambda x: '/'.join(sorted(x)))
.value_counts() .value_counts()
) )
self.logger.info(f" Framework distribution (per indicator label):") self.logger.info(f" Framework distribution (indicators at year={self.end_year}):")
for fw, cnt in ind_fw_label.items(): for fw, cnt in fw_dist_ind.items():
self.logger.info(f" {fw}: {cnt} indicators") self.logger.info(f" {fw}: {cnt} indicators")
self.logger.info( self.logger.info(
@@ -1022,26 +966,24 @@ class AnalyticalLayerLoader:
'rows_loaded' : rows_loaded, 'rows_loaded' : rows_loaded,
'completeness_pct' : 100.0, 'completeness_pct' : 100.0,
'config_snapshot' : json.dumps({ 'config_snapshot' : json.dumps({
'start_year' : self.start_year, 'start_year' : self.start_year,
'end_year' : self.end_year, 'end_year' : self.end_year,
'baseline_year' : self.baseline_year, 'baseline_year' : self.baseline_year,
'sdg_start_year' : self.sdg_start_year, 'sdg_start_year' : self.sdg_start_year,
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'norm_scale' : '1-100 per indicator global minmax direction-aware', 'norm_scale' : '1-100 per indicator global minmax direction-aware',
'framework_assignment' : 'per-row by year (not per-indicator)', 'framework_logic' : 'row-level: year < sdg_start_year → MDGs always',
'sdg_proxy_keywords' : list(_SDG_ERA_PROXY_KEYWORDS), 'condition_thresholds': {
'condition_thresholds' : {
'bad' : f'< {THRESHOLD_BAD}', 'bad' : f'< {THRESHOLD_BAD}',
'moderate': f'{THRESHOLD_BAD}-{THRESHOLD_GOOD}', 'moderate': f'{THRESHOLD_BAD}-{THRESHOLD_GOOD}',
'good' : f'> {THRESHOLD_GOOD}', 'good' : f'> {THRESHOLD_GOOD}',
}, },
}), }),
'validation_metrics' : json.dumps({ 'validation_metrics' : json.dumps({
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'total_indicators' : int(self.df_clean['indicator_id'].nunique()), 'total_indicators': int(self.df_clean['indicator_id'].nunique()),
'sdg_start_year' : self.sdg_start_year, 'sdg_start_year' : self.sdg_start_year,
'framework_dist_rows' : fw_dist_rows.to_dict(), 'framework_dist_rows': fw_dist_rows.to_dict(),
'framework_dist_inds' : ind_fw_label.to_dict(),
}) })
} }
save_etl_metadata(self.client, metadata) save_etl_metadata(self.client, metadata)
@@ -1064,9 +1006,8 @@ class AnalyticalLayerLoader:
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold") self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)") self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
self.logger.info("Framework: per-row by year (shared indicators split MDGs/SDGs)")
self.logger.info(f"SDG Proxy: FIES only (food insecurity/food insecure)")
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info("Framework: row-level (year < sdg_start_year → MDGs always)")
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.load_source_data() self.load_source_data()
@@ -1074,10 +1015,10 @@ class AnalyticalLayerLoader:
self.filter_complete_indicators_per_country() self.filter_complete_indicators_per_country()
self.select_countries_with_all_pillars() self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries() self.filter_indicators_consistent_across_fixed_countries()
self.determine_sdg_start_year() # Step 6: per-row framework assignment self.determine_sdg_start_year()
self.verify_no_gaps() self.verify_no_gaps()
self.calculate_norm_value() # Step 8: norm_value_1_100 self.calculate_norm_value() # Step 8: norm_value_1_100
self.calculate_yoy() # Step 9: yoy_change, yoy_pct self.calculate_yoy() # Step 9: yoy_change, yoy_pct
self.analyze_indicator_availability_by_year() self.analyze_indicator_availability_by_year()
self.save_analytical_table() self.save_analytical_table()
@@ -1116,8 +1057,8 @@ if __name__ == "__main__":
print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING") print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING")
print("Output: fact_asean_food_security_selected -> fs_asean_gold") print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print(f"Norm: min-max 1-100 per indicator, direction-aware") print(f"Norm: min-max 1-100 per indicator, direction-aware")
print(f"Framework: per-row by year | SDG Proxy: FIES only")
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
print(f"Framework: row-level (year < sdg_start_year → MDGs always)")
print("=" * 80) print("=" * 80)
logger = setup_logging() logger = setup_logging()