This commit is contained in:
Debby
2026-04-01 15:58:59 +07:00
parent 6a55a91112
commit c3b7674001

View File

@@ -19,31 +19,21 @@ NORMALISASI (Step 8):
- norm_value_1_100 = min-max normalisasi nilai raw per indikator, skala 1-100 - norm_value_1_100 = min-max normalisasi nilai raw per indikator, skala 1-100
- Direction-aware: lower_better diinvert sehingga nilai tinggi selalu = lebih baik - Direction-aware: lower_better diinvert sehingga nilai tinggi selalu = lebih baik
- Normalisasi dilakukan GLOBAL per indikator (semua negara, semua tahun sekaligus) - Normalisasi dilakukan GLOBAL per indikator (semua negara, semua tahun sekaligus)
sehingga nilai antar negara dan antar tahun tetap comparable
- Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio
FRAMEWORK LOGIC (Per-Row, threshold = sdg_start_year global): FRAMEWORK LOGIC (Per-Row, bukan per indikator):
- sdg_start_year dideteksi dari data: tahun pertama indikator FIES lengkap
sdg_start_year dideteksi HANYA dari FIES ("food insecurity" / "food insecure"), di semua fixed countries (setelah Step 3-5 filter selesai)
karena FIES adalah satu-satunya indikator yang murni baru di era SDGs. - Proxy deteksi sdg_start_year: HANYA FIES ("food insecurity", "food insecure")
Anemia, stunting, wasting, undernourishment TIDAK dipakai sebagai proxy Anemia TIDAK dipakai sebagai proxy karena datanya sudah ada sebelum era SDGs
karena data mereka sudah ada sebelum SDGs sehingga actual_start < sdg_start. - Framework di-assign PER BARIS (per year), bukan per indikator:
* row['year'] >= sdg_start_year AND nama ada di SDG_INDICATOR_KEYWORDS -> 'SDGs'
Framework di-assign PER BARIS menggunakan sdg_start_year global: * Selain itu -> 'MDGs'
- Indikator ada di SDG_INDICATOR_KEYWORDS AND year >= sdg_start_year -> 'SDGs' - Ini menangani indikator "shared" (anemia, stunting, wasting, undernourishment)
- Selain itu -> 'MDGs' yang datanya ada sebelum SDGs:
* row lama (year < sdg_start_year) -> 'MDGs'
Efek per kategori indikator (contoh sdg_start_year = 2016): * row baru (year >= sdg_start_year) -> 'SDGs'
Indikator shared (anemia, stunting, wasting, undernourishment):
data mulai 2013 -> year 2013, 2014, 2015 = 'MDGs' (year < 2016)
-> year 2016, 2017, ... = 'SDGs' (year >= 2016)
=> SPLIT: sebagian MDGs, sebagian SDGs ✓
Indikator FIES (murni SDGs):
data mulai 2016 (== sdg_start_year) -> seluruh baris = 'SDGs'
=> Selalu SDGs (tidak ada baris sebelum 2016) ✓
Indikator di luar SDG_INDICATOR_KEYWORDS:
-> selalu 'MDGs', tidak peduli tahunnya ✓
""" """
import pandas as pd import pandas as pd
@@ -71,14 +61,16 @@ from google.cloud import bigquery
# ============================================================================= # =============================================================================
# SDG INDICATOR KEYWORDS # SDG INDICATOR KEYWORDS
# Indikator yang termasuk SDG framework (target 2.1 & 2.2). # Daftar nama indikator (lowercase) yang masuk SDG framework.
# Framework per baris ditentukan oleh sdg_start_year global (dari FIES proxy). # Indikator ini akan di-assign 'SDGs' untuk baris dengan year >= sdg_start_year,
# dan 'MDGs' untuk baris dengan year < sdg_start_year.
# ============================================================================= # =============================================================================
SDG_INDICATOR_KEYWORDS = frozenset([ SDG_INDICATOR_KEYWORDS = frozenset([
# TARGET 2.1.1 — Prevalence of undernourishment (shared: ada sebelum SDGs) # TARGET 2.1.1 — Prevalence of undernourishment (shared: ada sebelum SDGs)
"prevalence of undernourishment (percent) (3-year average)", "prevalence of undernourishment (percent) (3-year average)",
"number of people undernourished (million) (3-year average)", "number of people undernourished (million) (3-year average)",
# TARGET 2.1.2 — FIES (murni baru di era SDGs) # TARGET 2.1.2 — FIES (SDGs only — murni baru di era SDGs)
"prevalence of severe food insecurity in the total population (percent) (3-year average)", "prevalence of severe food insecurity in the total population (percent) (3-year average)",
"prevalence of severe food insecurity in the male adult population (percent) (3-year average)", "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
"prevalence of severe food insecurity in the female adult population (percent) (3-year average)", "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
@@ -99,19 +91,23 @@ SDG_INDICATOR_KEYWORDS = frozenset([
"number of children under 5 years affected by wasting (million)", "number of children under 5 years affected by wasting (million)",
"percentage of children under 5 years of age who are overweight (modelled estimates) (percent)", "percentage of children under 5 years of age who are overweight (modelled estimates) (percent)",
"number of children under 5 years of age who are overweight (modeled estimates) (million)", "number of children under 5 years of age who are overweight (modeled estimates) (million)",
# TARGET 2.2.3 — Anaemia (shared: ada sebelum SDGs) # TARGET 2.2.3 — Anaemia (shared: data ada sebelum SDGs, listed here agar
# baris >= sdg_start_year di-assign 'SDGs')
"prevalence of anemia among women of reproductive age (15-49 years) (percent)", "prevalence of anemia among women of reproductive age (15-49 years) (percent)",
"number of women of reproductive age (15-49 years) affected by anemia (million)", "number of women of reproductive age (15-49 years) affected by anemia (million)",
]) ])
# ============================================================================= # =============================================================================
# SDG ERA PROXY KEYWORDS # SDG ERA PROXY KEYWORDS
# HANYA FIES — dipakai HANYA untuk mendeteksi sdg_start_year dari data. # HANYA indikator yang MURNI baru di era SDGs (FIES saja).
# Dipakai untuk mendeteksi sdg_start_year dari data.
# #
# KRITIS: anemia/stunting/wasting/undernourishment TIDAK boleh ada di sini: # PENTING: Anemia/anaemia TIDAK dipakai sebagai proxy:
# Data mereka sudah ada sebelum era SDGs sehingga actual_start_year < sdg_start_year. # Data anemia sudah ada sebelum era SDGs sehingga actual_start_year-nya
# Jika dipakai sebagai proxy, sdg_start_year terdeteksi terlalu awal (misal 2013) # lebih awal dari sdg_start_year. Jika dipakai sebagai proxy, sdg_start_year
# sehingga seluruh baris indikator shared menjadi 'SDGs' — SALAH. # akan terdeteksi terlalu awal dan seluruh baris anemia akan menjadi 'SDGs'.
# FIES adalah satu-satunya indikator yang benar-benar murni baru di era SDGs
# dan dapat dipakai sebagai penanda tahun mulainya era SDGs.
# ============================================================================= # =============================================================================
_SDG_ERA_PROXY_KEYWORDS = frozenset([ _SDG_ERA_PROXY_KEYWORDS = frozenset([
"food insecurity", "food insecurity",
@@ -121,13 +117,21 @@ _SDG_ERA_PROXY_KEYWORDS = frozenset([
# ============================================================================= # =============================================================================
# THRESHOLD KONDISI (fixed absolute, skala 1-100) # THRESHOLD KONDISI (fixed absolute, skala 1-100)
# ============================================================================= # =============================================================================
# Digunakan untuk assign kondisi di analysis_layer.
# Didefinisikan di sini agar konsisten antara kedua file.
# bad : norm_value_1_100 < THRESHOLD_BAD
# good : norm_value_1_100 > THRESHOLD_GOOD
# moderate : di antara keduanya
THRESHOLD_BAD = 40.0 THRESHOLD_BAD = 40.0
THRESHOLD_GOOD = 60.0 THRESHOLD_GOOD = 60.0
def assign_condition(norm_value_1_100: float) -> str: def assign_condition(norm_value_1_100: float) -> str:
""" """
Assign kondisi berdasarkan norm_value_1_100 (skala 1-100, direction-aware). Assign kondisi berdasarkan norm_value_1_100 (skala 1-100, sudah direction-aware).
Nilai tinggi selalu berarti lebih baik (lower_better sudah diinvert).
Returns: 'good' / 'moderate' / 'bad' Returns: 'good' / 'moderate' / 'bad'
""" """
if pd.isna(norm_value_1_100): if pd.isna(norm_value_1_100):
@@ -145,27 +149,30 @@ def assign_framework_per_row(
sdg_start_year: int, sdg_start_year: int,
) -> str: ) -> str:
""" """
Tentukan framework (MDGs/SDGs) per BARIS menggunakan sdg_start_year GLOBAL. Tentukan framework (MDGs/SDGs) per BARIS (per row year), bukan per indikator.
Rules: Logic:
1. Indikator TIDAK ada di SDG_INDICATOR_KEYWORDS -> selalu 'MDGs' - 'SDGs' jika KEDUA kondisi terpenuhi:
2. Indikator ada di SDG_INDICATOR_KEYWORDS: 1. Nama indikator ada di SDG_INDICATOR_KEYWORDS
- year >= sdg_start_year -> 'SDGs' 2. year (tahun baris ini) >= sdg_start_year
- year < sdg_start_year -> 'MDGs' - 'MDGs' untuk semua kasus lain.
sdg_start_year dideteksi dari FIES (proxy murni SDGs), bukan dari Mengapa per row, bukan per indikator?
actual_start_year masing-masing indikator. Ini memastikan indikator Indikator "shared" seperti anemia, stunting, wasting, undernourishment
shared (anemia, stunting, wasting, undernourishment) yang datanya memiliki data yang ada SEBELUM era SDGs dimulai. Jika assign dilakukan
ada sebelum SDGs tetap mendapat label 'MDGs' untuk baris sebelum per indikator menggunakan actual_start_year, indikator-indikator ini
sdg_start_year dan 'SDGs' untuk baris sejak sdg_start_year. akan selalu di-assign 'MDGs' karena actual_start_year < sdg_start_year.
Dengan assign per row menggunakan year baris:
- baris lama (year < sdg_start_year) -> 'MDGs' (benar: belum era SDGs)
- baris baru (year >= sdg_start_year) -> 'SDGs' (benar: sudah era SDGs)
Contoh (sdg_start_year = 2016): Contoh anemia (sdg_start_year = 2016):
anemia year=2013 -> 'MDGs' (ada di SDG list, tapi year < 2016) - row year=2013 -> 'MDGs'
anemia year=2015 -> 'MDGs' - row year=2014 -> 'MDGs'
anemia year=2016 -> 'SDGs' (year >= 2016) - row year=2015 -> 'MDGs'
anemia year=2023 -> 'SDGs' - row year=2016 -> 'SDGs'
FIES year=2016 -> 'SDGs' (tidak ada baris FIES sebelum 2016) - row year=2017 -> 'SDGs'
non-SDG year=any -> 'MDGs' (tidak ada di SDG_INDICATOR_KEYWORDS) - ...
""" """
name_lower = str(indicator_name).lower().strip() name_lower = str(indicator_name).lower().strip()
in_sdg_list = name_lower in SDG_INDICATOR_KEYWORDS in_sdg_list = name_lower in SDG_INDICATOR_KEYWORDS
@@ -180,28 +187,21 @@ def assign_framework_per_row(
class AnalyticalLayerLoader: class AnalyticalLayerLoader:
""" """
Analytical Layer Loader for BigQuery. Analytical Layer Loader for BigQuery
Output kolom fact_asean_food_security_selected: Output kolom fact_asean_food_security_selected:
country_id, country_name, country_id, country_name,
indicator_id, indicator_name, direction, framework, indicator_id, indicator_name, direction, framework,
pillar_id, pillar_name, pillar_id, pillar_name,
time_id, year, value, time_id, year, value,
norm_value_1_100, norm_value_1_100, <- min-max norm per indikator, skala 1-100, direction-aware
yoy_change, yoy_pct yoy_change, yoy_pct
Framework logic (sdg_start_year global dari FIES proxy): Catatan framework:
Indikator shared (anemia, stunting, wasting, undernourishment): Framework di-assign PER BARIS (per year), sehingga indikator shared
year < sdg_start_year -> 'MDGs' (misal 2013-2015) seperti anemia dapat memiliki framework berbeda di baris yang berbeda:
year >= sdg_start_year -> 'SDGs' (misal 2016-2023) - baris sebelum sdg_start_year -> 'MDGs'
=> SPLIT: sebagian MDGs, sebagian SDGs - baris sejak sdg_start_year -> 'SDGs'
Indikator FIES (murni SDGs):
seluruh baris -> 'SDGs'
(tidak ada data FIES sebelum sdg_start_year) ✓
Indikator di luar SDG_INDICATOR_KEYWORDS:
seluruh baris -> 'MDGs'
""" """
def __init__(self, client: bigquery.Client): def __init__(self, client: bigquery.Client):
@@ -218,9 +218,9 @@ class AnalyticalLayerLoader:
self.start_year = 2013 self.start_year = 2013
self.end_year = None self.end_year = None
self.baseline_year = 2023 # hardcode per syarat dosen self.baseline_year = 2023 # hardcode per syarat dosen (tahun terlengkap)
self.sdg_start_year = None # dideteksi HANYA dari FIES proxy di Step 6 self.sdg_start_year = None
self.pipeline_metadata = { self.pipeline_metadata = {
'source_class' : self.__class__.__name__, 'source_class' : self.__class__.__name__,
@@ -306,6 +306,7 @@ class AnalyticalLayerLoader:
self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES") self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES")
self.logger.info("=" * 80) self.logger.info("=" * 80)
# Filter single years only (is_year_range == False)
if 'is_year_range' in self.df_clean.columns: if 'is_year_range' in self.df_clean.columns:
before = len(self.df_clean) before = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['is_year_range'] == False].copy() self.df_clean = self.df_clean[self.df_clean['is_year_range'] == False].copy()
@@ -313,6 +314,7 @@ class AnalyticalLayerLoader:
f" Filter single years only: {before:,} -> {len(self.df_clean):,} rows" f" Filter single years only: {before:,} -> {len(self.df_clean):,} rows"
) )
# baseline_year = 2023 hardcode (syarat dosen: minimal 2023)
df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year] df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_baseline['indicator_id'].nunique() baseline_indicator_count = df_baseline['indicator_id'].nunique()
@@ -540,7 +542,6 @@ class AnalyticalLayerLoader:
self.df_clean['indicator_id'].isin(valid_indicators) self.df_clean['indicator_id'].isin(valid_indicators)
].copy() ].copy()
# Trim baris di bawah max_start_year per indikator
self.df_clean = self.df_clean.merge( self.df_clean = self.df_clean.merge(
indicator_max_start[['indicator_id', 'max_start_year']], indicator_max_start[['indicator_id', 'max_start_year']],
on='indicator_id', how='left' on='indicator_id', how='left'
@@ -566,16 +567,13 @@ class AnalyticalLayerLoader:
self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK PER ROW") self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK PER ROW")
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.logger.info( self.logger.info(
" sdg_start_year dideteksi HANYA dari FIES proxy\n" " Proxy: FIES only (food insecurity/food insecure).\n"
" (food insecurity / food insecure — murni baru di era SDGs).\n" " Anemia TIDAK dipakai sebagai proxy — datanya ada sebelum era SDGs.\n"
" Anemia/stunting/wasting/undernourishment TIDAK dipakai sebagai proxy.\n\n" " Framework di-assign PER BARIS (year), bukan per indikator."
" Framework per baris (threshold = sdg_start_year global):\n"
" SDG_INDICATOR_KEYWORDS + year >= sdg_start_year -> 'SDGs'\n"
" SDG_INDICATOR_KEYWORDS + year < sdg_start_year -> 'MDGs' [SPLIT]\n"
" Indikator di luar SDG_INDICATOR_KEYWORDS -> selalu 'MDGs'"
) )
# Hitung actual_start_year per indikator (untuk logging & validasi) # actual_start_year per indikator = max(min_year per country)
# = konsisten dengan max_start_year di Step 5
indicator_actual_start = ( indicator_actual_start = (
self.df_clean self.df_clean
.groupby(['indicator_id', 'indicator_name', 'country_id'])['year'] .groupby(['indicator_id', 'indicator_name', 'country_id'])['year']
@@ -585,9 +583,7 @@ class AnalyticalLayerLoader:
) )
indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year'] indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year']
# ------------------------------------------------------------------ # Deteksi sdg_start_year dari proxy SDGs-only (FIES saja, BUKAN anemia)
# Deteksi sdg_start_year HANYA dari FIES proxy
# ------------------------------------------------------------------
proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply( proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply(
lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS) lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS)
) )
@@ -595,46 +591,22 @@ class AnalyticalLayerLoader:
if df_proxy.empty: if df_proxy.empty:
raise ValueError( raise ValueError(
"Tidak ada indikator FIES (food insecurity/food insecure) yang lolos filter. " "Tidak ada indikator proxy SDGs (FIES) yang lolos filter. "
"Pastikan indikator FIES ada di data dan lolos Step 3-5." "Pastikan indikator FIES (food insecurity/food insecure) ada di data."
) )
self.sdg_start_year = int(df_proxy['actual_start_year'].min()) self.sdg_start_year = int(df_proxy['actual_start_year'].min())
self.logger.info(f"\n sdg_start_year = {self.sdg_start_year}")
self.logger.info(f"\n sdg_start_year = {self.sdg_start_year} (dari FIES proxy)") self.logger.info(f" Proxy indicators (FIES only):")
self.logger.info(f" FIES proxy indicators:")
for _, row in df_proxy.iterrows(): for _, row in df_proxy.iterrows():
self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}") self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}")
# Log indikator shared yang akan split (ada di SDG list, data mulai sebelum sdg_start_year) # ----------------------------------------------------------------
shared_sdg = indicator_actual_start[ # Assign framework PER BARIS menggunakan year baris, bukan actual_start_year
~proxy_mask & # Sehingga indikator "shared" (anemia, stunting, dll) mendapat:
indicator_actual_start['indicator_name'].str.lower().isin(SDG_INDICATOR_KEYWORDS) & # - 'MDGs' untuk baris sebelum sdg_start_year
(indicator_actual_start['actual_start_year'] < self.sdg_start_year) # - 'SDGs' untuk baris sejak sdg_start_year
] # ----------------------------------------------------------------
if not shared_sdg.empty:
self.logger.info(
f"\n Indikator shared yang akan SPLIT MDGs/SDGs "
f"(data mulai < sdg_start_year={self.sdg_start_year}):"
)
for _, row in shared_sdg.iterrows():
n_mdgs = len(self.df_clean[
(self.df_clean['indicator_id'] == row['indicator_id']) &
(self.df_clean['year'] < self.sdg_start_year)
])
n_sdgs = len(self.df_clean[
(self.df_clean['indicator_id'] == row['indicator_id']) &
(self.df_clean['year'] >= self.sdg_start_year)
])
self.logger.info(
f" [actual_start={int(row['actual_start_year'])}] "
f"{row['indicator_name'][:50]} "
f"| MDGs rows: {n_mdgs:,} | SDGs rows: {n_sdgs:,}"
)
# ------------------------------------------------------------------
# Assign framework PER BARIS menggunakan sdg_start_year global
# ------------------------------------------------------------------
self.df_clean['framework'] = self.df_clean.apply( self.df_clean['framework'] = self.df_clean.apply(
lambda row: assign_framework_per_row( lambda row: assign_framework_per_row(
indicator_name = row['indicator_name'], indicator_name = row['indicator_name'],
@@ -644,9 +616,9 @@ class AnalyticalLayerLoader:
axis=1 axis=1
) )
# ------------------------------------------------------------------ # ----------------------------------------------------------------
# Logging ringkasan per indikator # Logging: ringkasan per indikator (frameworks apa yang muncul)
# ------------------------------------------------------------------ # ----------------------------------------------------------------
ind_fw_summary = ( ind_fw_summary = (
self.df_clean self.df_clean
.groupby(['indicator_id', 'indicator_name'])['framework'] .groupby(['indicator_id', 'indicator_name'])['framework']
@@ -662,9 +634,9 @@ class AnalyticalLayerLoader:
) )
self.logger.info(f"\n Framework assignment per indikator:") self.logger.info(f"\n Framework assignment per indikator:")
self.logger.info(f" {'-'*90}") self.logger.info(f" {'-'*85}")
self.logger.info(f" {'ID':<5} {'Frameworks':<18} {'ActualStart':<13} {'Indicator Name'}") self.logger.info(f" {'ID':<5} {'Frameworks':<18} {'ActualStart':<13} {'Indicator Name'}")
self.logger.info(f" {'-'*90}") self.logger.info(f" {'-'*85}")
for _, row in ind_fw_summary.sort_values( for _, row in ind_fw_summary.sort_values(
['frameworks', 'actual_start_year', 'indicator_name'] ['frameworks', 'actual_start_year', 'indicator_name']
).iterrows(): ).iterrows():
@@ -673,48 +645,24 @@ class AnalyticalLayerLoader:
f"{int(row['actual_start_year']):<13} {row['indicator_name'][:48]}" f"{int(row['actual_start_year']):<13} {row['indicator_name'][:48]}"
) )
# Ringkasan per kategori # Indikator dengan framework split (MDGs/SDGs) — highlight untuk validasi
mdgs_only = ind_fw_summary[ind_fw_summary['frameworks'] == 'MDGs']
sdgs_only = ind_fw_summary[ind_fw_summary['frameworks'] == 'SDGs']
split_inds = ind_fw_summary[ind_fw_summary['frameworks'] == 'MDGs/SDGs'] split_inds = ind_fw_summary[ind_fw_summary['frameworks'] == 'MDGs/SDGs']
if not mdgs_only.empty:
self.logger.info(
f"\n [MDGs only — {len(mdgs_only)} indikator] "
f"Tidak ada di SDG_INDICATOR_KEYWORDS:"
)
for _, row in mdgs_only.iterrows():
self.logger.info(f" - {row['indicator_name'][:65]}")
if not sdgs_only.empty:
self.logger.info(
f"\n [SDGs only — {len(sdgs_only)} indikator] "
f"Data mulai = sdg_start_year, tidak ada baris sebelumnya:"
)
for _, row in sdgs_only.iterrows():
self.logger.info(
f" - [{int(row['actual_start_year'])}] {row['indicator_name'][:65]}"
)
if not split_inds.empty: if not split_inds.empty:
self.logger.info( self.logger.info(
f"\n [SPLIT MDGs/SDGs — {len(split_inds)} indikator] " f"\n [INFO] {len(split_inds)} indikator memiliki framework split "
f"Baris < {self.sdg_start_year} = MDGs | " f"(MDGs sebelum {self.sdg_start_year}, SDGs sejak {self.sdg_start_year}):"
f"Baris >= {self.sdg_start_year} = SDGs:"
) )
for _, row in split_inds.iterrows(): for _, row in split_inds.iterrows():
self.logger.info( self.logger.info(f" - {row['indicator_name'][:60]}")
f" - [actual_start={int(row['actual_start_year'])}] "
f"{row['indicator_name'][:65]}"
)
fw_summary = self.df_clean['framework'].value_counts() fw_summary = self.df_clean['framework'].value_counts()
self.logger.info( self.logger.info(
f"\n Ringkasan rows: " + f"\n Ringkasan rows: " +
" | ".join(f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items()) " | ".join(f"{fw}: {cnt:,}" for fw, cnt in fw_summary.items())
) )
self.logger.info( self.logger.info(
f"\n [OK] 'framework' ditambahkan — " f"\n [OK] 'framework' ditambahkan per row "
f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | " f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | "
f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows" f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows"
) )
@@ -756,6 +704,25 @@ class AnalyticalLayerLoader:
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def calculate_norm_value(self): def calculate_norm_value(self):
"""
Hitung norm_value_1_100 per indikator — min-max normalisasi skala 1-100,
direction-aware.
CARA KERJA:
- Normalisasi dilakukan GLOBAL per indikator (semua negara + semua tahun sekaligus)
sehingga nilai antar negara dan antar tahun tetap comparable.
- lower_better diinvert: nilai tinggi selalu = kondisi lebih baik.
Contoh: undernourishment 5% (rendah = baik) → norm tinggi setelah invert.
- Skala 1-100 (bukan 0-100) untuk menghindari nilai absolut nol di Looker Studio.
- Kolom ini memungkinkan perbandingan lintas indikator yang berbeda satuan
(persen, juta orang, dll) karena sudah dinormalisasi ke skala yang sama.
Catatan:
- Berbeda dengan norm_value di _get_norm_value_df() di analysis_layer
yang skala 0-1 dan dipakai untuk agregasi composite score.
- norm_value_1_100 ini adalah per baris (per country per year per indicator),
untuk ditampilkan langsung di Looker Studio.
"""
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR") self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR")
self.logger.info("=" * 80) self.logger.info("=" * 80)
@@ -768,10 +735,7 @@ class AnalyticalLayerLoader:
norm_parts = [] norm_parts = []
indicators = df.groupby(['indicator_id', 'indicator_name', 'direction']) indicators = df.groupby(['indicator_id', 'indicator_name', 'direction'])
self.logger.info( self.logger.info(f"\n {'ID':<5} {'Direction':<15} {'Invert':<8} {'Min':>10} {'Max':>10} {'Indicator Name'}")
f"\n {'ID':<5} {'Direction':<15} {'Invert':<8} "
f"{'Min':>10} {'Max':>10} {'Indicator Name'}"
)
self.logger.info(f" {'-'*90}") self.logger.info(f" {'-'*90}")
for (ind_id, ind_name, direction), grp in indicators: for (ind_id, ind_name, direction), grp in indicators:
@@ -785,17 +749,21 @@ class AnalyticalLayerLoader:
norm_parts.append(grp) norm_parts.append(grp)
continue continue
raw = grp.loc[valid_mask, 'value'].values raw = grp.loc[valid_mask, 'value'].values
v_min = raw.min() v_min = raw.min()
v_max = raw.max() v_max = raw.max()
normed = np.full(len(grp), np.nan) normed = np.full(len(grp), np.nan)
if v_min == v_max: if v_min == v_max:
# Semua nilai sama → beri nilai tengah (50.5 pada skala 1-100)
normed[valid_mask.values] = 50.5 normed[valid_mask.values] = 50.5
else: else:
# Min-max ke 0-1 dulu
scaled = (raw - v_min) / (v_max - v_min) scaled = (raw - v_min) / (v_max - v_min)
# Invert jika lower_better
if do_invert: if do_invert:
scaled = 1.0 - scaled scaled = 1.0 - scaled
# Scale ke 1-100
normed[valid_mask.values] = 1.0 + scaled * 99.0 normed[valid_mask.values] = 1.0 + scaled * 99.0
grp['norm_value_1_100'] = normed grp['norm_value_1_100'] = normed
@@ -808,6 +776,7 @@ class AnalyticalLayerLoader:
self.df_clean = pd.concat(norm_parts, ignore_index=True) self.df_clean = pd.concat(norm_parts, ignore_index=True)
# Statistik ringkasan
valid_norm = self.df_clean['norm_value_1_100'].notna().sum() valid_norm = self.df_clean['norm_value_1_100'].notna().sum()
null_norm = self.df_clean['norm_value_1_100'].isna().sum() null_norm = self.df_clean['norm_value_1_100'].isna().sum()
self.logger.info(f"\n norm_value_1_100 — valid: {valid_norm:,} | null: {null_norm:,}") self.logger.info(f"\n norm_value_1_100 — valid: {valid_norm:,} | null: {null_norm:,}")
@@ -817,17 +786,15 @@ class AnalyticalLayerLoader:
f"{self.df_clean['norm_value_1_100'].max():.2f}" f"{self.df_clean['norm_value_1_100'].max():.2f}"
) )
# Log distribusi kondisi berdasarkan threshold
self.df_clean['_condition_preview'] = self.df_clean['norm_value_1_100'].apply(assign_condition) self.df_clean['_condition_preview'] = self.df_clean['norm_value_1_100'].apply(assign_condition)
cond_dist = self.df_clean['_condition_preview'].value_counts() cond_dist = self.df_clean['_condition_preview'].value_counts()
self.logger.info( self.logger.info(f"\n Distribusi kondisi (threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}):")
f"\n Distribusi kondisi "
f"(threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}):"
)
for cond, cnt in cond_dist.items(): for cond, cnt in cond_dist.items():
self.logger.info(f" {cond}: {cnt:,} rows") self.logger.info(f" {cond}: {cnt:,} rows")
self.df_clean = self.df_clean.drop(columns=['_condition_preview']) self.df_clean = self.df_clean.drop(columns=['_condition_preview'])
self.logger.info(f"\n [OK] Kolom 'norm_value_1_100' ditambahkan") self.logger.info(f"\n [OK] Kolom 'norm_value_1_100' ditambahkan ke df_clean")
return self.df_clean return self.df_clean
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@@ -895,6 +862,7 @@ class AnalyticalLayerLoader:
'start_year', 'end_year', 'country_count' 'start_year', 'end_year', 'country_count'
] ]
# Framework summary per indikator (bisa MDGs, SDGs, atau MDGs/SDGs split)
ind_fw = ( ind_fw = (
self.df_clean self.df_clean
.groupby('indicator_id')['framework'] .groupby('indicator_id')['framework']
@@ -995,11 +963,13 @@ class AnalyticalLayerLoader:
self.logger.info(f" Total rows: {len(analytical_df):,}") self.logger.info(f" Total rows: {len(analytical_df):,}")
# Framework distribution per row
fw_dist_rows = analytical_df['framework'].value_counts() fw_dist_rows = analytical_df['framework'].value_counts()
self.logger.info(f" Framework distribution (rows):") self.logger.info(f" Framework distribution (rows):")
for fw, cnt in fw_dist_rows.items(): for fw, cnt in fw_dist_rows.items():
self.logger.info(f" {fw}: {cnt:,} rows") self.logger.info(f" {fw}: {cnt:,} rows")
# Framework distribution per indikator (label)
ind_fw_label = ( ind_fw_label = (
analytical_df analytical_df
.groupby('indicator_id')['framework'] .groupby('indicator_id')['framework']
@@ -1058,11 +1028,7 @@ class AnalyticalLayerLoader:
'sdg_start_year' : self.sdg_start_year, 'sdg_start_year' : self.sdg_start_year,
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'norm_scale' : '1-100 per indicator global minmax direction-aware', 'norm_scale' : '1-100 per indicator global minmax direction-aware',
'framework_assignment' : ( 'framework_assignment' : 'per-row by year (not per-indicator)',
f'per-row, sdg_start_year={self.sdg_start_year} global (FIES proxy only). '
'SDG_INDICATOR_KEYWORDS + year >= sdg_start_year -> SDGs, else MDGs. '
'Shared indicators (anemia/stunting/wasting/undernourishment) split MDGs/SDGs.'
),
'sdg_proxy_keywords' : list(_SDG_ERA_PROXY_KEYWORDS), 'sdg_proxy_keywords' : list(_SDG_ERA_PROXY_KEYWORDS),
'condition_thresholds' : { 'condition_thresholds' : {
'bad' : f'< {THRESHOLD_BAD}', 'bad' : f'< {THRESHOLD_BAD}',
@@ -1098,12 +1064,8 @@ class AnalyticalLayerLoader:
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold") self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)") self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
self.logger.info( self.logger.info("Framework: per-row by year (shared indicators split MDGs/SDGs)")
"Framework: per-row, threshold = sdg_start_year global (dari FIES proxy)\n" self.logger.info(f"SDG Proxy: FIES only (food insecurity/food insecure)")
" SDG_INDICATOR_KEYWORDS + year >= sdg_start_year -> 'SDGs'\n"
" SDG_INDICATOR_KEYWORDS + year < sdg_start_year -> 'MDGs' [SPLIT]\n"
" Indikator di luar SDG_INDICATOR_KEYWORDS -> selalu 'MDGs'"
)
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info("=" * 80) self.logger.info("=" * 80)
@@ -1112,10 +1074,10 @@ class AnalyticalLayerLoader:
self.filter_complete_indicators_per_country() self.filter_complete_indicators_per_country()
self.select_countries_with_all_pillars() self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries() self.filter_indicators_consistent_across_fixed_countries()
self.determine_sdg_start_year() self.determine_sdg_start_year() # Step 6: per-row framework assignment
self.verify_no_gaps() self.verify_no_gaps()
self.calculate_norm_value() self.calculate_norm_value() # Step 8: norm_value_1_100
self.calculate_yoy() self.calculate_yoy() # Step 9: yoy_change, yoy_pct
self.analyze_indicator_availability_by_year() self.analyze_indicator_availability_by_year()
self.save_analytical_table() self.save_analytical_table()
@@ -1127,7 +1089,7 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.logger.info(f" Duration : {duration:.2f}s") self.logger.info(f" Duration : {duration:.2f}s")
self.logger.info(f" Year Range : {self.start_year}-{self.end_year}") self.logger.info(f" Year Range : {self.start_year}-{self.end_year}")
self.logger.info(f" SDG Start Yr : {self.sdg_start_year} (dari FIES proxy)") self.logger.info(f" SDG Start Yr : {self.sdg_start_year}")
self.logger.info(f" Countries : {len(self.selected_country_ids)}") self.logger.info(f" Countries : {len(self.selected_country_ids)}")
self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}") self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}")
self.logger.info(f" Rows Loaded : {self.pipeline_metadata['rows_loaded']:,}") self.logger.info(f" Rows Loaded : {self.pipeline_metadata['rows_loaded']:,}")
@@ -1154,12 +1116,7 @@ if __name__ == "__main__":
print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING") print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING")
print("Output: fact_asean_food_security_selected -> fs_asean_gold") print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print(f"Norm: min-max 1-100 per indicator, direction-aware") print(f"Norm: min-max 1-100 per indicator, direction-aware")
print( print(f"Framework: per-row by year | SDG Proxy: FIES only")
"Framework: per-row, threshold = sdg_start_year global (dari FIES proxy)\n"
" SDG_INDICATOR_KEYWORDS + year >= sdg_start_year -> SDGs\n"
" SDG_INDICATOR_KEYWORDS + year < sdg_start_year -> MDGs [SPLIT]\n"
" Indikator di luar SDG_INDICATOR_KEYWORDS -> selalu MDGs"
)
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
print("=" * 80) print("=" * 80)