sdg start year and label condition

This commit is contained in:
Debby
2026-03-31 15:42:11 +07:00
parent ddc9fb3b48
commit beb494f89c
2 changed files with 513 additions and 690 deletions

View File

@@ -4,9 +4,17 @@ Semua agregasi pakai norm_value dari _get_norm_value_df()
UPDATED: UPDATED:
- _classify_indicators() membaca kolom 'framework' langsung dari - _classify_indicators() membaca kolom 'framework' langsung dari
fact_asean_food_security_selected (bukan heuristik gap min_year). fact_asean_food_security_selected (sudah di-assign di analytical_layer
- Kolom 'framework' sudah ditanam sejak bigquery_cleaned_layer.py berdasarkan SDG_INDICATOR_KEYWORDS + actual_start_year).
berdasarkan daftar eksplisit SDG Goal 2 (2030 Agenda, versi Maret 2020). - Kolom 'condition' (good/moderate/bad) ditambahkan ke semua tabel agregasi:
* agg_pillar_composite
* agg_pillar_by_country
* agg_framework_by_country
* agg_framework_asean
Threshold fixed absolute (skala 1-100, direction-aware):
bad : score < 40
moderate : 40 <= score <= 60
good : score > 60
Simpan 6 tabel ke fs_asean_gold (layer='gold'): Simpan 6 tabel ke fs_asean_gold (layer='gold'):
- agg_pillar_composite - agg_pillar_composite
@@ -17,7 +25,6 @@ Simpan 6 tabel ke fs_asean_gold (layer='gold'):
- agg_narrative_pillar - agg_narrative_pillar
SOURCE TABLE: fact_asean_food_security_selected SOURCE TABLE: fact_asean_food_security_selected
(sudah include country_name, indicator_name, pillar_name, direction, framework)
""" """
import pandas as pd import pandas as pd
@@ -52,6 +59,25 @@ DIRECTION_POSITIVE_KEYWORDS = frozenset({
NORMALIZE_FRAMEWORKS_JOINTLY = False NORMALIZE_FRAMEWORKS_JOINTLY = False
# Threshold kondisi — fixed absolute, skala 1-100
# Konsisten dengan THRESHOLD_BAD / THRESHOLD_GOOD di analytical_layer
THRESHOLD_BAD = 40.0
THRESHOLD_GOOD = 60.0
def assign_condition(score) -> str:
"""
Assign kondisi berdasarkan score skala 1-100 (direction-aware, nilai tinggi = lebih baik).
Returns: 'good' / 'moderate' / 'bad' / None jika NaN
"""
if score is None or (isinstance(score, float) and np.isnan(score)):
return None
if score > THRESHOLD_GOOD:
return 'good'
if score < THRESHOLD_BAD:
return 'bad'
return 'moderate'
# ============================================================================= # =============================================================================
# Windows CP1252 safe logging # Windows CP1252 safe logging
@@ -145,6 +171,24 @@ def check_and_dedup(
return df return df
def add_condition_column(df: pd.DataFrame, score_col: str) -> pd.DataFrame:
"""
Tambahkan kolom 'condition' berdasarkan score_col.
Threshold: bad < 40, moderate 40-60, good > 60 (skala 1-100).
"""
df['condition'] = df[score_col].apply(assign_condition)
return df
def log_condition_summary(df: pd.DataFrame, context: str, logger) -> None:
"""Log distribusi kondisi untuk verifikasi."""
dist = df['condition'].value_counts()
logger.info(
f" Condition distribution ({context}): " +
" | ".join(f"{c}: {n:,}" for c, n in dist.items())
)
# ============================================================================= # =============================================================================
# NARRATIVE BUILDER FUNCTIONS # NARRATIVE BUILDER FUNCTIONS
# ============================================================================= # =============================================================================
@@ -163,20 +207,10 @@ def _fmt_delta(delta) -> str:
def _build_overview_narrative( def _build_overview_narrative(
year: int, year, n_mdg, n_sdg, n_total_ind, score, yoy_val, yoy_pct,
n_mdg: int, prev_year, prev_score, ranking_list,
n_sdg: int, most_improved_country, most_improved_delta,
n_total_ind: int, most_declined_country, most_declined_delta,
score: float,
yoy_val,
yoy_pct,
prev_year: int,
prev_score,
ranking_list: list,
most_improved_country,
most_improved_delta,
most_declined_country,
most_declined_delta,
) -> str: ) -> str:
parts_ind = [] parts_ind = []
if n_mdg > 0: if n_mdg > 0:
@@ -220,7 +254,6 @@ def _build_overview_narrative(
first = ranking_list[0] first = ranking_list[0]
last = ranking_list[-1] last = ranking_list[-1]
middle = ranking_list[1:-1] middle = ranking_list[1:-1]
if len(ranking_list) == 1: if len(ranking_list) == 1:
sent3 = ( sent3 = (
f"In terms of country performance, {first['country_name']} was the only " f"In terms of country performance, {first['country_name']} was the only "
@@ -234,15 +267,11 @@ def _build_overview_narrative(
f"{_fmt_score(last['score'])} in {year}." f"{_fmt_score(last['score'])} in {year}."
) )
else: else:
middle_parts = [ middle_parts = [f"{c['country_name']} ({_fmt_score(c['score'])})" for c in middle]
f"{c['country_name']} ({_fmt_score(c['score'])})" middle_str = (
for c in middle middle_parts[0] if len(middle_parts) == 1
] else ", ".join(middle_parts[:-1]) + f", and {middle_parts[-1]}"
if len(middle_parts) == 1: )
middle_str = middle_parts[0]
else:
middle_str = ", ".join(middle_parts[:-1]) + f", and {middle_parts[-1]}"
sent3 = ( sent3 = (
f"In terms of country performance, {first['country_name']} led the region " f"In terms of country performance, {first['country_name']} led the region "
f"with a score of {_fmt_score(first['score'])}, followed by {middle_str}. " f"with a score of {_fmt_score(first['score'])}, followed by {middle_str}. "
@@ -277,24 +306,11 @@ def _build_overview_narrative(
def _build_pillar_narrative( def _build_pillar_narrative(
year: int, year, pillar_name, pillar_score, rank_in_year, n_pillars, yoy_val,
pillar_name: str, top_country, top_country_score, bot_country, bot_country_score,
pillar_score: float, strongest_pillar, strongest_score, weakest_pillar, weakest_score,
rank_in_year: int, most_improved_pillar, most_improved_delta,
n_pillars: int, most_declined_pillar, most_declined_delta,
yoy_val,
top_country,
top_country_score,
bot_country,
bot_country_score,
strongest_pillar,
strongest_score,
weakest_pillar,
weakest_score,
most_improved_pillar,
most_improved_delta,
most_declined_pillar,
most_declined_delta,
) -> str: ) -> str:
rank_suffix = {1: "st", 2: "nd", 3: "rd"}.get(rank_in_year, "th") rank_suffix = {1: "st", 2: "nd", 3: "rd"}.get(rank_in_year, "th")
sent1 = ( sent1 = (
@@ -392,7 +408,7 @@ class FoodSecurityAggregator:
self.sdgs_indicator_ids = set() self.sdgs_indicator_ids = set()
# ========================================================================= # =========================================================================
# STEP 1: Load data dari Gold layer # STEP 1: Load data
# ========================================================================= # =========================================================================
def load_data(self): def load_data(self):
@@ -409,14 +425,12 @@ class FoodSecurityAggregator:
"country_id", "country_name", "country_id", "country_name",
"indicator_id", "indicator_name", "direction", "framework", "indicator_id", "indicator_name", "direction", "framework",
"pillar_id", "pillar_name", "pillar_id", "pillar_name",
"time_id", "year", "time_id", "year", "value",
"value",
} }
missing_cols = required_cols - set(self.df.columns) missing_cols = required_cols - set(self.df.columns)
if missing_cols: if missing_cols:
raise ValueError( raise ValueError(
f"Kolom berikut tidak ditemukan di fact_asean_food_security_selected: " f"Kolom berikut tidak ditemukan: {missing_cols}\n"
f"{missing_cols}\n"
f"Pastikan pipeline dijalankan berurutan:\n" f"Pastikan pipeline dijalankan berurutan:\n"
f" 1. bigquery_cleaned_layer.py\n" f" 1. bigquery_cleaned_layer.py\n"
f" 2. bigquery_dimensional_model.py\n" f" 2. bigquery_dimensional_model.py\n"
@@ -424,69 +438,35 @@ class FoodSecurityAggregator:
f" 4. bigquery_analysis_layer.py (file ini)" f" 4. bigquery_analysis_layer.py (file ini)"
) )
n_null_dir = self.df["direction"].isna().sum()
if n_null_dir > 0:
self.logger.warning(
f" [DIRECTION] {n_null_dir} rows dengan direction NULL -> diisi 'positive'"
)
self.df["direction"] = self.df["direction"].fillna("positive") self.df["direction"] = self.df["direction"].fillna("positive")
n_null_fw = self.df["framework"].isna().sum()
if n_null_fw > 0:
self.logger.warning(
f" [FRAMEWORK] {n_null_fw} rows dengan framework NULL -> diisi 'MDGs'"
)
self.df["framework"] = self.df["framework"].fillna("MDGs") self.df["framework"] = self.df["framework"].fillna("MDGs")
dir_dist = self.df.drop_duplicates("indicator_id")["direction"].value_counts() dir_dist = self.df.drop_duplicates("indicator_id")["direction"].value_counts()
self.logger.info(f"\n Distribusi direction per indikator:") self.logger.info(f"\n Distribusi direction per indikator:")
for d, cnt in dir_dist.items(): for d, cnt in dir_dist.items():
tag = "INVERT" if _should_invert(d, self.logger, "load_data check") else "normal" tag = "INVERT" if _should_invert(d, self.logger, "load_data") else "normal"
self.logger.info(f" {d:<25} : {cnt:>3} indikator [{tag}]") self.logger.info(f" {d:<25} : {cnt:>3} [{tag}]")
fw_dist = self.df.drop_duplicates("indicator_id")["framework"].value_counts() fw_dist = self.df.drop_duplicates("indicator_id")["framework"].value_counts()
self.logger.info(f"\n Distribusi framework per indikator:") self.logger.info(f"\n Distribusi framework per indikator:")
for fw, cnt in fw_dist.items(): for fw, cnt in fw_dist.items():
self.logger.info(f" {fw:<10} : {cnt:>3} indikator") self.logger.info(f" {fw:<10} : {cnt:>3}")
self.logger.info(f"\n Rows loaded : {len(self.df):,}")
self.logger.info(f" Negara : {self.df['country_id'].nunique()}")
self.logger.info(f" Indikator : {self.df['indicator_id'].nunique()}")
self.logger.info( self.logger.info(
f"\n Rows: {len(self.df):,} | Negara: {self.df['country_id'].nunique()} | "
f"Indikator: {self.df['indicator_id'].nunique()} | "
f"Tahun: {int(self.df['year'].min())}-{int(self.df['year'].max())}" f"Tahun: {int(self.df['year'].min())}-{int(self.df['year'].max())}"
) )
# ========================================================================= # =========================================================================
# STEP 1b: Klasifikasi indikator ke MDGs / SDGs # STEP 1b: Klasifikasi indikator
# ========================================================================= # =========================================================================
def _classify_indicators(self): def _classify_indicators(self):
"""
Klasifikasi indikator ke MDGs / SDGs.
UPDATED: Membaca kolom 'framework' langsung dari tabel
fact_asean_food_security_selected — tidak lagi menggunakan heuristik
gap detection berdasarkan min_year. Klasifikasi eksplisit sudah dilakukan
di bigquery_cleaned_layer.py berdasarkan daftar resmi SDG Goal 2.
sdgs_start_year dihitung dari tahun minimum data SDG yang tersedia,
bukan dari asumsi threshold hardcoded.
"""
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info("STEP 1b: KLASIFIKASI INDIKATOR -> MDGs / SDGs") self.logger.info("STEP 1b: KLASIFIKASI INDIKATOR -> MDGs / SDGs")
self.logger.info("=" * 70) self.logger.info("=" * 70)
if "framework" not in self.df.columns:
raise ValueError(
"Kolom 'framework' tidak ditemukan di fact_asean_food_security_selected.\n"
"Pastikan pipeline dijalankan berurutan:\n"
" 1. bigquery_cleaned_layer.py (assign_framework)\n"
" 2. bigquery_dimensional_model.py (dim_indicator + framework)\n"
" 3. bigquery_analytical_layer.py (propagasi ke fact_selected)\n"
" 4. bigquery_analysis_layer.py (file ini)"
)
# Baca langsung dari kolom — tidak ada gap detection / heuristik
self.mdgs_indicator_ids = set( self.mdgs_indicator_ids = set(
self.df[self.df["framework"] == "MDGs"]["indicator_id"].unique().tolist() self.df[self.df["framework"] == "MDGs"]["indicator_id"].unique().tolist()
) )
@@ -494,24 +474,41 @@ class FoodSecurityAggregator:
self.df[self.df["framework"] == "SDGs"]["indicator_id"].unique().tolist() self.df[self.df["framework"] == "SDGs"]["indicator_id"].unique().tolist()
) )
# sdgs_start_year: tahun pertama kemunculan data SDG di dataset # sdgs_start_year: ambil dari proxy SDGs-only (FIES/anaemia)
# Digunakan untuk memisahkan era pre-SDG (MDGs only) dan era campuran (MDGs + SDGs) # Konsisten dengan cara analytical_layer mendeteksinya
_PROXY_KW = frozenset(['food insecurity', 'anemia', 'anaemia'])
proxy_mask = (
(self.df["framework"] == "SDGs") &
self.df["indicator_name"].str.lower().apply(
lambda n: any(kw in n for kw in _PROXY_KW)
)
)
df_proxy = self.df[proxy_mask]
if not df_proxy.empty:
self.sdgs_start_year = int(df_proxy["year"].min())
self.logger.info(
f"\n sdgs_start_year = {self.sdgs_start_year} "
f"(dari proxy FIES/anaemia di tabel)"
)
else:
# Fallback: min year dari semua SDGs rows
sdgs_rows = self.df[self.df["framework"] == "SDGs"] sdgs_rows = self.df[self.df["framework"] == "SDGs"]
if not sdgs_rows.empty: if not sdgs_rows.empty:
self.sdgs_start_year = int(sdgs_rows["year"].min()) self.sdgs_start_year = int(sdgs_rows["year"].min())
self.logger.warning(
f" [WARN] Proxy tidak ditemukan, fallback ke min(year) SDGs: "
f"{self.sdgs_start_year}"
)
else: else:
# Tidak ada SDG sama sekali — set ke tahun setelah akhir data
self.sdgs_start_year = int(self.df["year"].max()) + 1 self.sdgs_start_year = int(self.df["year"].max()) + 1
self.logger.warning( self.logger.warning(
f" [WARN] Tidak ada indikator SDGs. sdgs_start_year = {self.sdgs_start_year}" f" [WARN] Tidak ada SDGs. sdgs_start_year = {self.sdgs_start_year}"
) )
self.logger.info(f"\n Sumber klasifikasi : kolom 'framework' dari tabel")
self.logger.info(f" MDGs : {len(self.mdgs_indicator_ids)} indikator") self.logger.info(f" MDGs : {len(self.mdgs_indicator_ids)} indikator")
self.logger.info(f" SDGs : {len(self.sdgs_indicator_ids)} indikator") self.logger.info(f" SDGs : {len(self.sdgs_indicator_ids)} indikator")
self.logger.info(f" sdgs_start_year : {self.sdgs_start_year} (dari data aktual)")
# Log detail per framework untuk verifikasi
for fw in ["MDGs", "SDGs"]: for fw in ["MDGs", "SDGs"]:
fw_inds = ( fw_inds = (
self.df[self.df["framework"] == fw] self.df[self.df["framework"] == fw]
@@ -523,15 +520,10 @@ class FoodSecurityAggregator:
self.logger.info(f" [{int(row['indicator_id'])}] {row['indicator_name']}") self.logger.info(f" [{int(row['indicator_id'])}] {row['indicator_name']}")
# ========================================================================= # =========================================================================
# CORE HELPER: normalisasi raw value per indikator # CORE HELPER: normalisasi 0-1 per indikator (untuk composite score)
# ========================================================================= # =========================================================================
def _get_norm_value_df(self) -> pd.DataFrame: def _get_norm_value_df(self) -> pd.DataFrame:
if "framework" not in self.df.columns:
raise ValueError(
"Kolom 'framework' tidak ada. Pastikan _classify_indicators() dipanggil lebih dulu."
)
norm_parts = [] norm_parts = []
for ind_id, grp in self.df.groupby("indicator_id"): for ind_id, grp in self.df.groupby("indicator_id"):
grp = grp.copy() grp = grp.copy()
@@ -548,6 +540,7 @@ class FoodSecurityAggregator:
raw = grp.loc[valid_mask, "value"].values raw = grp.loc[valid_mask, "value"].values
v_min, v_max = raw.min(), raw.max() v_min, v_max = raw.min(), raw.max()
normed = np.full(len(grp), np.nan) normed = np.full(len(grp), np.nan)
if v_min == v_max: if v_min == v_max:
normed[valid_mask.values] = 0.5 normed[valid_mask.values] = 0.5
else: else:
@@ -562,14 +555,14 @@ class FoodSecurityAggregator:
return pd.concat(norm_parts, ignore_index=True) return pd.concat(norm_parts, ignore_index=True)
# ========================================================================= # =========================================================================
# STEP 2: agg_pillar_composite -> Gold # STEP 2: agg_pillar_composite
# ========================================================================= # =========================================================================
def calc_pillar_composite(self) -> pd.DataFrame: def calc_pillar_composite(self) -> pd.DataFrame:
table_name = "agg_pillar_composite" table_name = "agg_pillar_composite"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 2: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 2: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
df_normed = self._get_norm_value_df() df_normed = self._get_norm_value_df()
@@ -592,6 +585,8 @@ class FoodSecurityAggregator:
.astype(int) .astype(int)
) )
df = add_yoy(df, ["pillar_id"], "pillar_score_1_100") df = add_yoy(df, ["pillar_id"], "pillar_score_1_100")
df = add_condition_column(df, "pillar_score_1_100")
log_condition_summary(df, table_name, self.logger)
df["pillar_id"] = df["pillar_id"].astype(int) df["pillar_id"] = df["pillar_id"].astype(int)
df["year"] = df["year"].astype(int) df["year"] = df["year"].astype(int)
@@ -611,6 +606,7 @@ class FoodSecurityAggregator:
bigquery.SchemaField("pillar_score_1_100", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("pillar_score_1_100", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("rank_in_year", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("rank_in_year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("condition", "STRING", mode="NULLABLE"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(
self.client, df, table_name, layer='gold', self.client, df, table_name, layer='gold',
@@ -620,14 +616,14 @@ class FoodSecurityAggregator:
return df return df
# ========================================================================= # =========================================================================
# STEP 3: agg_pillar_by_country -> Gold # STEP 3: agg_pillar_by_country
# ========================================================================= # =========================================================================
def calc_pillar_by_country(self) -> pd.DataFrame: def calc_pillar_by_country(self) -> pd.DataFrame:
table_name = "agg_pillar_by_country" table_name = "agg_pillar_by_country"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 3: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 3: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
df_normed = self._get_norm_value_df() df_normed = self._get_norm_value_df()
@@ -646,6 +642,8 @@ class FoodSecurityAggregator:
.astype(int) .astype(int)
) )
df = add_yoy(df, ["country_id", "pillar_id"], "pillar_country_score_1_100") df = add_yoy(df, ["country_id", "pillar_id"], "pillar_country_score_1_100")
df = add_condition_column(df, "pillar_country_score_1_100")
log_condition_summary(df, table_name, self.logger)
df["country_id"] = df["country_id"].astype(int) df["country_id"] = df["country_id"].astype(int)
df["pillar_id"] = df["pillar_id"].astype(int) df["pillar_id"] = df["pillar_id"].astype(int)
@@ -664,6 +662,7 @@ class FoodSecurityAggregator:
bigquery.SchemaField("pillar_country_score_1_100", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("pillar_country_score_1_100", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("rank_in_pillar_year", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("rank_in_pillar_year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("condition", "STRING", mode="NULLABLE"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(
self.client, df, table_name, layer='gold', self.client, df, table_name, layer='gold',
@@ -673,11 +672,10 @@ class FoodSecurityAggregator:
return df return df
# ========================================================================= # =========================================================================
# STEP 4: agg_framework_by_country -> Gold # STEP 4: agg_framework_by_country
# ========================================================================= # =========================================================================
def _calc_country_composite_inmemory(self) -> pd.DataFrame: def _calc_country_composite_inmemory(self) -> pd.DataFrame:
"""Hitung country composite in-memory (tidak disimpan ke BQ)."""
df_normed = self._get_norm_value_df() df_normed = self._get_norm_value_df()
df = ( df = (
df_normed df_normed
@@ -707,7 +705,7 @@ class FoodSecurityAggregator:
table_name = "agg_framework_by_country" table_name = "agg_framework_by_country"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 4: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 4: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
country_composite = self._calc_country_composite_inmemory() country_composite = self._calc_country_composite_inmemory()
@@ -729,10 +727,8 @@ class FoodSecurityAggregator:
agg_total["framework"] = "Total" agg_total["framework"] = "Total"
parts.append(agg_total) parts.append(agg_total)
# Layer MDGs — Era pre-SDGs = Total # Layer MDGs pre-SDGs
pre_sdgs_rows = country_composite[ pre_sdgs_rows = country_composite[country_composite["year"] < self.sdgs_start_year].copy()
country_composite["year"] < self.sdgs_start_year
].copy()
if not pre_sdgs_rows.empty: if not pre_sdgs_rows.empty:
mdgs_pre = ( mdgs_pre = (
pre_sdgs_rows[[ pre_sdgs_rows[[
@@ -748,7 +744,7 @@ class FoodSecurityAggregator:
mdgs_pre["framework"] = "MDGs" mdgs_pre["framework"] = "MDGs"
parts.append(mdgs_pre) parts.append(mdgs_pre)
# Layer MDGs — Era mixed (setelah SDGs masuk) # Layer MDGs mixed (setelah SDGs masuk)
if self.mdgs_indicator_ids: if self.mdgs_indicator_ids:
df_mdgs_mixed = df_normed[ df_mdgs_mixed = df_normed[
(df_normed["indicator_id"].isin(self.mdgs_indicator_ids)) & (df_normed["indicator_id"].isin(self.mdgs_indicator_ids)) &
@@ -758,16 +754,11 @@ class FoodSecurityAggregator:
agg_mdgs_mixed = ( agg_mdgs_mixed = (
df_mdgs_mixed df_mdgs_mixed
.groupby(["country_id", "country_name", "year"]) .groupby(["country_id", "country_name", "year"])
.agg( .agg(framework_norm=("norm_value", "mean"), n_indicators=("indicator_id", "nunique"))
framework_norm=("norm_value", "mean"),
n_indicators =("indicator_id", "nunique")
)
.reset_index() .reset_index()
) )
if not NORMALIZE_FRAMEWORKS_JOINTLY: if not NORMALIZE_FRAMEWORKS_JOINTLY:
agg_mdgs_mixed["framework_score_1_100"] = global_minmax( agg_mdgs_mixed["framework_score_1_100"] = global_minmax(agg_mdgs_mixed["framework_norm"])
agg_mdgs_mixed["framework_norm"]
)
agg_mdgs_mixed["framework"] = "MDGs" agg_mdgs_mixed["framework"] = "MDGs"
parts.append(agg_mdgs_mixed) parts.append(agg_mdgs_mixed)
@@ -781,40 +772,30 @@ class FoodSecurityAggregator:
agg_sdgs = ( agg_sdgs = (
df_sdgs df_sdgs
.groupby(["country_id", "country_name", "year"]) .groupby(["country_id", "country_name", "year"])
.agg( .agg(framework_norm=("norm_value", "mean"), n_indicators=("indicator_id", "nunique"))
framework_norm=("norm_value", "mean"),
n_indicators =("indicator_id", "nunique")
)
.reset_index() .reset_index()
) )
if not NORMALIZE_FRAMEWORKS_JOINTLY: if not NORMALIZE_FRAMEWORKS_JOINTLY:
agg_sdgs["framework_score_1_100"] = global_minmax( agg_sdgs["framework_score_1_100"] = global_minmax(agg_sdgs["framework_norm"])
agg_sdgs["framework_norm"]
)
agg_sdgs["framework"] = "SDGs" agg_sdgs["framework"] = "SDGs"
parts.append(agg_sdgs) parts.append(agg_sdgs)
df = pd.concat(parts, ignore_index=True) df = pd.concat(parts, ignore_index=True)
if NORMALIZE_FRAMEWORKS_JOINTLY: if NORMALIZE_FRAMEWORKS_JOINTLY:
mixed_mask = ( mixed_mask = (df["framework"].isin(["MDGs", "SDGs"])) & (df["year"] >= self.sdgs_start_year)
(df["framework"].isin(["MDGs", "SDGs"])) &
(df["year"] >= self.sdgs_start_year)
)
if mixed_mask.any(): if mixed_mask.any():
df.loc[mixed_mask, "framework_score_1_100"] = global_minmax( df.loc[mixed_mask, "framework_score_1_100"] = global_minmax(df.loc[mixed_mask, "framework_norm"])
df.loc[mixed_mask, "framework_norm"]
)
df = check_and_dedup( df = check_and_dedup(df, ["country_id", "framework", "year"], context=table_name, logger=self.logger)
df, ["country_id", "framework", "year"], context=table_name, logger=self.logger
)
df["rank_in_framework_year"] = ( df["rank_in_framework_year"] = (
df.groupby(["framework", "year"])["framework_score_1_100"] df.groupby(["framework", "year"])["framework_score_1_100"]
.rank(method="min", ascending=False) .rank(method="min", ascending=False)
.astype(int) .astype(int)
) )
df = add_yoy(df, ["country_id", "framework"], "framework_score_1_100") df = add_yoy(df, ["country_id", "framework"], "framework_score_1_100")
df = add_condition_column(df, "framework_score_1_100")
log_condition_summary(df, table_name, self.logger)
df["country_id"] = df["country_id"].astype(int) df["country_id"] = df["country_id"].astype(int)
df["year"] = df["year"].astype(int) df["year"] = df["year"].astype(int)
@@ -835,6 +816,7 @@ class FoodSecurityAggregator:
bigquery.SchemaField("framework_score_1_100", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("framework_score_1_100", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("rank_in_framework_year", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("rank_in_framework_year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("condition", "STRING", mode="NULLABLE"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(
self.client, df, table_name, layer='gold', self.client, df, table_name, layer='gold',
@@ -844,14 +826,14 @@ class FoodSecurityAggregator:
return df return df
# ========================================================================= # =========================================================================
# STEP 5: agg_framework_asean -> Gold # STEP 5: agg_framework_asean
# ========================================================================= # =========================================================================
def calc_framework_asean(self) -> pd.DataFrame: def calc_framework_asean(self) -> pd.DataFrame:
table_name = "agg_framework_asean" table_name = "agg_framework_asean"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 5: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 5: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
df_normed = self._get_norm_value_df() df_normed = self._get_norm_value_df()
@@ -865,45 +847,30 @@ class FoodSecurityAggregator:
) )
asean_overall = ( asean_overall = (
country_norm.groupby("year") country_norm.groupby("year")
.agg( .agg(asean_norm=("country_norm", "mean"), std_norm=("country_norm", "std"), n_countries=("country_norm", "count"))
asean_norm =("country_norm", "mean"),
std_norm =("country_norm", "std"),
n_countries =("country_norm", "count")
)
.reset_index() .reset_index()
) )
asean_overall["asean_score_1_100"] = global_minmax(asean_overall["asean_norm"]) asean_overall["asean_score_1_100"] = global_minmax(asean_overall["asean_norm"])
asean_comp = (
country_composite.groupby("year")["composite_score"]
.mean().reset_index()
.rename(columns={"composite_score": "asean_composite"})
)
asean_overall = asean_overall.merge(asean_comp, on="year", how="left")
parts = [] parts = []
# Layer TOTAL # Layer TOTAL
total_cols = asean_overall[[ total_cols = asean_overall[["year", "asean_score_1_100", "asean_norm", "std_norm", "n_countries"]].copy()
"year", "asean_score_1_100", "asean_norm", "std_norm", "n_countries" total_cols = total_cols.rename(columns={
]].copy().rename(columns={
"asean_score_1_100": "framework_score_1_100", "asean_score_1_100": "framework_score_1_100",
"asean_norm" : "framework_norm", "asean_norm" : "framework_norm",
"n_countries" : "n_countries_with_data", "n_countries" : "n_countries_with_data",
}) })
n_ind_total = ( n_ind_total = df_normed.groupby("year")["indicator_id"].nunique().reset_index().rename(columns={"indicator_id": "n_indicators"})
df_normed.groupby("year")["indicator_id"].nunique()
.reset_index().rename(columns={"indicator_id": "n_indicators"})
)
total_cols = total_cols.merge(n_ind_total, on="year", how="left") total_cols = total_cols.merge(n_ind_total, on="year", how="left")
total_cols["framework"] = "Total" total_cols["framework"] = "Total"
parts.append(total_cols) parts.append(total_cols)
# Layer MDGs pre-SDGs = Total # Layer MDGs pre-SDGs
pre_sdgs = asean_overall[asean_overall["year"] < self.sdgs_start_year].copy() pre_sdgs = asean_overall[asean_overall["year"] < self.sdgs_start_year].copy()
if not pre_sdgs.empty: if not pre_sdgs.empty:
mdgs_pre = pre_sdgs[[ mdgs_pre = pre_sdgs[["year", "asean_score_1_100", "asean_norm", "std_norm", "n_countries"]].copy()
"year", "asean_score_1_100", "asean_norm", "std_norm", "n_countries" mdgs_pre = mdgs_pre.rename(columns={
]].copy().rename(columns={
"asean_score_1_100": "framework_score_1_100", "asean_score_1_100": "framework_score_1_100",
"asean_norm" : "framework_norm", "asean_norm" : "framework_norm",
"n_countries" : "n_countries_with_data", "n_countries" : "n_countries_with_data",
@@ -917,7 +884,7 @@ class FoodSecurityAggregator:
mdgs_pre["framework"] = "MDGs" mdgs_pre["framework"] = "MDGs"
parts.append(mdgs_pre) parts.append(mdgs_pre)
# Layer MDGs mixed # Layer MDGs mixed
if self.mdgs_indicator_ids: if self.mdgs_indicator_ids:
df_mdgs_mixed = df_normed[ df_mdgs_mixed = df_normed[
(df_normed["indicator_id"].isin(self.mdgs_indicator_ids)) & (df_normed["indicator_id"].isin(self.mdgs_indicator_ids)) &
@@ -925,8 +892,7 @@ class FoodSecurityAggregator:
].copy() ].copy()
if not df_mdgs_mixed.empty: if not df_mdgs_mixed.empty:
cn = ( cn = (
df_mdgs_mixed df_mdgs_mixed.groupby(["country_id", "year"])["norm_value"].mean()
.groupby(["country_id", "year"])["norm_value"].mean()
.reset_index().rename(columns={"norm_value": "country_norm"}) .reset_index().rename(columns={"norm_value": "country_norm"})
) )
asean_mdgs = cn.groupby("year").agg( asean_mdgs = cn.groupby("year").agg(
@@ -934,15 +900,10 @@ class FoodSecurityAggregator:
std_norm =("country_norm", "std"), std_norm =("country_norm", "std"),
n_countries_with_data =("country_id", "count"), n_countries_with_data =("country_id", "count"),
).reset_index() ).reset_index()
n_ind_mdgs = ( n_ind_mdgs = df_mdgs_mixed.groupby("year")["indicator_id"].nunique().reset_index().rename(columns={"indicator_id": "n_indicators"})
df_mdgs_mixed.groupby("year")["indicator_id"].nunique()
.reset_index().rename(columns={"indicator_id": "n_indicators"})
)
asean_mdgs = asean_mdgs.merge(n_ind_mdgs, on="year", how="left") asean_mdgs = asean_mdgs.merge(n_ind_mdgs, on="year", how="left")
if not NORMALIZE_FRAMEWORKS_JOINTLY: if not NORMALIZE_FRAMEWORKS_JOINTLY:
asean_mdgs["framework_score_1_100"] = global_minmax( asean_mdgs["framework_score_1_100"] = global_minmax(asean_mdgs["framework_norm"])
asean_mdgs["framework_norm"]
)
asean_mdgs["framework"] = "MDGs" asean_mdgs["framework"] = "MDGs"
parts.append(asean_mdgs) parts.append(asean_mdgs)
@@ -954,8 +915,7 @@ class FoodSecurityAggregator:
].copy() ].copy()
if not df_sdgs.empty: if not df_sdgs.empty:
cn = ( cn = (
df_sdgs df_sdgs.groupby(["country_id", "year"])["norm_value"].mean()
.groupby(["country_id", "year"])["norm_value"].mean()
.reset_index().rename(columns={"norm_value": "country_norm"}) .reset_index().rename(columns={"norm_value": "country_norm"})
) )
asean_sdgs = cn.groupby("year").agg( asean_sdgs = cn.groupby("year").agg(
@@ -963,34 +923,24 @@ class FoodSecurityAggregator:
std_norm =("country_norm", "std"), std_norm =("country_norm", "std"),
n_countries_with_data =("country_id", "count"), n_countries_with_data =("country_id", "count"),
).reset_index() ).reset_index()
n_ind_sdgs = ( n_ind_sdgs = df_sdgs.groupby("year")["indicator_id"].nunique().reset_index().rename(columns={"indicator_id": "n_indicators"})
df_sdgs.groupby("year")["indicator_id"].nunique()
.reset_index().rename(columns={"indicator_id": "n_indicators"})
)
asean_sdgs = asean_sdgs.merge(n_ind_sdgs, on="year", how="left") asean_sdgs = asean_sdgs.merge(n_ind_sdgs, on="year", how="left")
if not NORMALIZE_FRAMEWORKS_JOINTLY: if not NORMALIZE_FRAMEWORKS_JOINTLY:
asean_sdgs["framework_score_1_100"] = global_minmax( asean_sdgs["framework_score_1_100"] = global_minmax(asean_sdgs["framework_norm"])
asean_sdgs["framework_norm"]
)
asean_sdgs["framework"] = "SDGs" asean_sdgs["framework"] = "SDGs"
parts.append(asean_sdgs) parts.append(asean_sdgs)
df = pd.concat(parts, ignore_index=True) df = pd.concat(parts, ignore_index=True)
if NORMALIZE_FRAMEWORKS_JOINTLY: if NORMALIZE_FRAMEWORKS_JOINTLY:
mixed_mask = ( mixed_mask = (df["framework"].isin(["MDGs", "SDGs"])) & (df["year"] >= self.sdgs_start_year)
(df["framework"].isin(["MDGs", "SDGs"])) &
(df["year"] >= self.sdgs_start_year)
)
if mixed_mask.any(): if mixed_mask.any():
df.loc[mixed_mask, "framework_score_1_100"] = global_minmax( df.loc[mixed_mask, "framework_score_1_100"] = global_minmax(df.loc[mixed_mask, "framework_norm"])
df.loc[mixed_mask, "framework_norm"]
)
df = check_and_dedup( df = check_and_dedup(df, ["framework", "year"], context=table_name, logger=self.logger)
df, ["framework", "year"], context=table_name, logger=self.logger
)
df = add_yoy(df, ["framework"], "framework_score_1_100") df = add_yoy(df, ["framework"], "framework_score_1_100")
df = add_condition_column(df, "framework_score_1_100")
log_condition_summary(df, table_name, self.logger)
df["year"] = df["year"].astype(int) df["year"] = df["year"].astype(int)
df["n_indicators"] = safe_int(df["n_indicators"], col_name="n_indicators", logger=self.logger) df["n_indicators"] = safe_int(df["n_indicators"], col_name="n_indicators", logger=self.logger)
@@ -1009,6 +959,7 @@ class FoodSecurityAggregator:
bigquery.SchemaField("std_norm", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("std_norm", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("framework_score_1_100", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("framework_score_1_100", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("year_over_year_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("condition", "STRING", mode="NULLABLE"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(
self.client, df, table_name, layer='gold', self.client, df, table_name, layer='gold',
@@ -1018,39 +969,20 @@ class FoodSecurityAggregator:
return df return df
# ========================================================================= # =========================================================================
# STEP 6: agg_narrative_overview -> Gold # STEP 6 & 7: Narrative (tidak ada perubahan)
# ========================================================================= # =========================================================================
def calc_narrative_overview( def calc_narrative_overview(self, df_framework_asean, df_framework_by_country):
self,
df_framework_asean: pd.DataFrame,
df_framework_by_country: pd.DataFrame,
) -> pd.DataFrame:
table_name = "agg_narrative_overview" table_name = "agg_narrative_overview"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 6: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 6: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
asean_total = ( asean_total = df_framework_asean[df_framework_asean["framework"] == "Total"].sort_values("year").reset_index(drop=True)
df_framework_asean[df_framework_asean["framework"] == "Total"] score_by_year = dict(zip(asean_total["year"].astype(int), asean_total["framework_score_1_100"].astype(float)))
.sort_values("year") country_total = df_framework_by_country[df_framework_by_country["framework"] == "Total"].copy()
.reset_index(drop=True)
)
score_by_year = dict(zip(
asean_total["year"].astype(int),
asean_total["framework_score_1_100"].astype(float),
))
country_total = (
df_framework_by_country[df_framework_by_country["framework"] == "Total"]
.copy()
)
# Gunakan kolom framework dari self.df untuk hitung MDG/SDG per tahun
ind_year = self.df.drop_duplicates(subset=["indicator_id", "year", "framework"]) ind_year = self.df.drop_duplicates(subset=["indicator_id", "year", "framework"])
records = [] records = []
for _, row in asean_total.iterrows(): for _, row in asean_total.iterrows():
@@ -1063,21 +995,10 @@ class FoodSecurityAggregator:
n_mdg = int(yr_ind[yr_ind["framework"] == "MDGs"]["indicator_id"].nunique()) n_mdg = int(yr_ind[yr_ind["framework"] == "MDGs"]["indicator_id"].nunique())
n_sdg = int(yr_ind[yr_ind["framework"] == "SDGs"]["indicator_id"].nunique()) n_sdg = int(yr_ind[yr_ind["framework"] == "SDGs"]["indicator_id"].nunique())
n_total_ind = int(yr_ind["indicator_id"].nunique()) n_total_ind = int(yr_ind["indicator_id"].nunique())
prev_score = score_by_year.get(yr - 1, None) prev_score = score_by_year.get(yr - 1, None)
yoy_pct = ((yoy_val / prev_score * 100) if (yoy_val is not None and prev_score and prev_score != 0) else None)
yoy_pct = ( yr_country = country_total[country_total["year"] == yr].sort_values("rank_in_framework_year").reset_index(drop=True)
(yoy_val / prev_score * 100)
if (yoy_val is not None and prev_score is not None and prev_score != 0)
else None
)
yr_country = (
country_total[country_total["year"] == yr]
.sort_values("rank_in_framework_year")
.reset_index(drop=True)
)
ranking_list = [] ranking_list = []
for _, cr in yr_country.iterrows(): for _, cr in yr_country.iterrows():
cr_yoy = cr.get("year_over_year_change", None) cr_yoy = cr.get("year_over_year_change", None)
@@ -1087,7 +1008,6 @@ class FoodSecurityAggregator:
"score" : round(float(cr["framework_score_1_100"]), 2), "score" : round(float(cr["framework_score_1_100"]), 2),
"yoy_change" : round(float(cr_yoy), 2) if pd.notna(cr_yoy) else None, "yoy_change" : round(float(cr_yoy), 2) if pd.notna(cr_yoy) else None,
}) })
country_ranking_json = json.dumps(ranking_list, ensure_ascii=False)
yr_country_yoy = yr_country.dropna(subset=["year_over_year_change"]) yr_country_yoy = yr_country.dropna(subset=["year_over_year_change"])
if not yr_country_yoy.empty: if not yr_country_yoy.empty:
@@ -1102,20 +1022,11 @@ class FoodSecurityAggregator:
most_improved_delta = most_declined_delta = None most_improved_delta = most_declined_delta = None
narrative = _build_overview_narrative( narrative = _build_overview_narrative(
year = yr, year=yr, n_mdg=n_mdg, n_sdg=n_sdg, n_total_ind=n_total_ind,
n_mdg = n_mdg, score=score, yoy_val=yoy_val, yoy_pct=yoy_pct,
n_sdg = n_sdg, prev_year=yr-1, prev_score=prev_score, ranking_list=ranking_list,
n_total_ind = n_total_ind, most_improved_country=most_improved_country, most_improved_delta=most_improved_delta,
score = score, most_declined_country=most_declined_country, most_declined_delta=most_declined_delta,
yoy_val = yoy_val,
yoy_pct = yoy_pct,
prev_year = yr - 1,
prev_score = prev_score,
ranking_list = ranking_list,
most_improved_country = most_improved_country,
most_improved_delta = most_improved_delta,
most_declined_country = most_declined_country,
most_declined_delta = most_declined_delta,
) )
records.append({ records.append({
@@ -1126,7 +1037,7 @@ class FoodSecurityAggregator:
"asean_total_score" : round(score, 2), "asean_total_score" : round(score, 2),
"yoy_change" : yoy_val, "yoy_change" : yoy_val,
"yoy_change_pct" : round(yoy_pct, 2) if yoy_pct is not None else None, "yoy_change_pct" : round(yoy_pct, 2) if yoy_pct is not None else None,
"country_ranking_json" : country_ranking_json, "country_ranking_json" : json.dumps(ranking_list, ensure_ascii=False),
"most_improved_country": most_improved_country, "most_improved_country": most_improved_country,
"most_improved_delta" : most_improved_delta, "most_improved_delta" : most_improved_delta,
"most_declined_country": most_declined_country, "most_declined_country": most_declined_country,
@@ -1158,39 +1069,21 @@ class FoodSecurityAggregator:
bigquery.SchemaField("most_declined_delta", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("most_declined_delta", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("narrative_overview", "STRING", mode="REQUIRED"), bigquery.SchemaField("narrative_overview", "STRING", mode="REQUIRED"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(self.client, df, table_name, layer='gold', write_disposition="WRITE_TRUNCATE", schema=schema)
self.client, df, table_name, layer='gold',
write_disposition="WRITE_TRUNCATE", schema=schema,
)
self._finalize(table_name, rows) self._finalize(table_name, rows)
return df return df
# ========================================================================= def calc_narrative_pillar(self, df_pillar_composite, df_pillar_by_country):
# STEP 7: agg_narrative_pillar -> Gold
# =========================================================================
def calc_narrative_pillar(
self,
df_pillar_composite: pd.DataFrame,
df_pillar_by_country: pd.DataFrame,
) -> pd.DataFrame:
table_name = "agg_narrative_pillar" table_name = "agg_narrative_pillar"
self.load_metadata[table_name]["start_time"] = datetime.now() self.load_metadata[table_name]["start_time"] = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info(f"STEP 7: {table_name} -> [Gold] fs_asean_gold") self.logger.info(f"STEP 7: {table_name}")
self.logger.info("=" * 70) self.logger.info("=" * 70)
records = [] records = []
years = sorted(df_pillar_composite["year"].unique()) for yr in sorted(df_pillar_composite["year"].unique()):
yr_pillars = df_pillar_composite[df_pillar_composite["year"] == yr].sort_values("rank_in_year").reset_index(drop=True)
for yr in years:
yr_pillars = (
df_pillar_composite[df_pillar_composite["year"] == yr]
.sort_values("rank_in_year")
.reset_index(drop=True)
)
yr_country_pillar = df_pillar_by_country[df_pillar_by_country["year"] == yr] yr_country_pillar = df_pillar_by_country[df_pillar_by_country["year"] == yr]
strongest_pillar = yr_pillars.iloc[0] if len(yr_pillars) > 0 else None strongest_pillar = yr_pillars.iloc[0] if len(yr_pillars) > 0 else None
weakest_pillar = yr_pillars.iloc[-1] if len(yr_pillars) > 0 else None weakest_pillar = yr_pillars.iloc[-1] if len(yr_pillars) > 0 else None
@@ -1208,54 +1101,37 @@ class FoodSecurityAggregator:
for _, prow in yr_pillars.iterrows(): for _, prow in yr_pillars.iterrows():
p_id = int(prow["pillar_id"]) p_id = int(prow["pillar_id"])
p_name = str(prow["pillar_name"]) p_country = yr_country_pillar[yr_country_pillar["pillar_id"] == p_id].sort_values("rank_in_pillar_year").reset_index(drop=True)
p_score = float(prow["pillar_score_1_100"]) top_country = bot_country = None
p_rank = int(prow["rank_in_year"]) top_country_score = bot_country_score = None
p_yoy = prow["year_over_year_change"]
p_yoy_val = float(p_yoy) if pd.notna(p_yoy) else None
p_country = (
yr_country_pillar[yr_country_pillar["pillar_id"] == p_id]
.sort_values("rank_in_pillar_year")
.reset_index(drop=True)
)
if not p_country.empty: if not p_country.empty:
top_country = str(p_country.iloc[0]["country_name"]) top_country = str(p_country.iloc[0]["country_name"])
top_country_score = round(float(p_country.iloc[0]["pillar_country_score_1_100"]), 2) top_country_score = round(float(p_country.iloc[0]["pillar_country_score_1_100"]), 2)
bot_country = str(p_country.iloc[-1]["country_name"]) bot_country = str(p_country.iloc[-1]["country_name"])
bot_country_score = round(float(p_country.iloc[-1]["pillar_country_score_1_100"]), 2) bot_country_score = round(float(p_country.iloc[-1]["pillar_country_score_1_100"]), 2)
else:
top_country = bot_country = None
top_country_score = bot_country_score = None
p_yoy = prow["year_over_year_change"]
narrative = _build_pillar_narrative( narrative = _build_pillar_narrative(
year = yr, year=yr, pillar_name=str(prow["pillar_name"]),
pillar_name = p_name, pillar_score=float(prow["pillar_score_1_100"]),
pillar_score = p_score, rank_in_year=int(prow["rank_in_year"]), n_pillars=len(yr_pillars),
rank_in_year = p_rank, yoy_val=float(p_yoy) if pd.notna(p_yoy) else None,
n_pillars = len(yr_pillars), top_country=top_country, top_country_score=top_country_score,
yoy_val = p_yoy_val, bot_country=bot_country, bot_country_score=bot_country_score,
top_country = top_country,
top_country_score = top_country_score,
bot_country = bot_country,
bot_country_score = bot_country_score,
strongest_pillar=str(strongest_pillar["pillar_name"]) if strongest_pillar is not None else None, strongest_pillar=str(strongest_pillar["pillar_name"]) if strongest_pillar is not None else None,
strongest_score=round(float(strongest_pillar["pillar_score_1_100"]), 2) if strongest_pillar is not None else None, strongest_score=round(float(strongest_pillar["pillar_score_1_100"]), 2) if strongest_pillar is not None else None,
weakest_pillar=str(weakest_pillar["pillar_name"]) if weakest_pillar is not None else None, weakest_pillar=str(weakest_pillar["pillar_name"]) if weakest_pillar is not None else None,
weakest_score=round(float(weakest_pillar["pillar_score_1_100"]), 2) if weakest_pillar is not None else None, weakest_score=round(float(weakest_pillar["pillar_score_1_100"]), 2) if weakest_pillar is not None else None,
most_improved_pillar = most_improved_pillar, most_improved_pillar=most_improved_pillar, most_improved_delta=most_improved_delta,
most_improved_delta = most_improved_delta, most_declined_pillar=most_declined_pillar, most_declined_delta=most_declined_delta,
most_declined_pillar = most_declined_pillar,
most_declined_delta = most_declined_delta,
) )
records.append({ records.append({
"year" : yr, "year" : yr,
"pillar_id" : p_id, "pillar_id" : p_id,
"pillar_name" : p_name, "pillar_name" : str(prow["pillar_name"]),
"pillar_score" : round(p_score, 2), "pillar_score" : round(float(prow["pillar_score_1_100"]), 2),
"rank_in_year" : p_rank, "rank_in_year" : int(prow["rank_in_year"]),
"yoy_change" : p_yoy_val, "yoy_change" : float(p_yoy) if pd.notna(p_yoy) else None,
"top_country" : top_country, "top_country" : top_country,
"top_country_score" : top_country_score, "top_country_score" : top_country_score,
"bottom_country" : bot_country, "bottom_country" : bot_country,
@@ -1283,10 +1159,7 @@ class FoodSecurityAggregator:
bigquery.SchemaField("bottom_country_score", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("bottom_country_score", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("narrative_pillar", "STRING", mode="REQUIRED"), bigquery.SchemaField("narrative_pillar", "STRING", mode="REQUIRED"),
] ]
rows = load_to_bigquery( rows = load_to_bigquery(self.client, df, table_name, layer='gold', write_disposition="WRITE_TRUNCATE", schema=schema)
self.client, df, table_name, layer='gold',
write_disposition="WRITE_TRUNCATE", schema=schema,
)
self._finalize(table_name, rows) self._finalize(table_name, rows)
return df return df
@@ -1297,19 +1170,13 @@ class FoodSecurityAggregator:
def _validate_mdgs_equals_total(self, df: pd.DataFrame, level: str = ""): def _validate_mdgs_equals_total(self, df: pd.DataFrame, level: str = ""):
self.logger.info(f"\n Validasi MDGs < {self.sdgs_start_year} == Total [{level}]:") self.logger.info(f"\n Validasi MDGs < {self.sdgs_start_year} == Total [{level}]:")
group_by = ["year"] if level.startswith("asean") else ["country_id", "year"] group_by = ["year"] if level.startswith("asean") else ["country_id", "year"]
mdgs_pre = df[ mdgs_pre = df[(df["framework"] == "MDGs") & (df["year"] < self.sdgs_start_year)][group_by + ["framework_score_1_100"]].rename(columns={"framework_score_1_100": "mdgs_score"})
(df["framework"] == "MDGs") & (df["year"] < self.sdgs_start_year) total_pre = df[(df["framework"] == "Total") & (df["year"] < self.sdgs_start_year)][group_by + ["framework_score_1_100"]].rename(columns={"framework_score_1_100": "total_score"})
][group_by + ["framework_score_1_100"]].rename(columns={"framework_score_1_100": "mdgs_score"})
total_pre = df[
(df["framework"] == "Total") & (df["year"] < self.sdgs_start_year)
][group_by + ["framework_score_1_100"]].rename(columns={"framework_score_1_100": "total_score"})
if mdgs_pre.empty and total_pre.empty: if mdgs_pre.empty and total_pre.empty:
self.logger.info(f" -> Tidak ada data pre-{self.sdgs_start_year} (skip)") self.logger.info(f" -> Tidak ada data pre-{self.sdgs_start_year} (skip)")
return return
if mdgs_pre.empty or total_pre.empty: if mdgs_pre.empty or total_pre.empty:
self.logger.warning( self.logger.warning(f" -> [WARNING] Salah satu kosong: MDGs={len(mdgs_pre)}, Total={len(total_pre)}")
f" -> [WARNING] Salah satu kosong: MDGs={len(mdgs_pre)}, Total={len(total_pre)}"
)
return return
check = mdgs_pre.merge(total_pre, on=group_by) check = mdgs_pre.merge(total_pre, on=group_by)
max_diff = (check["mdgs_score"] - check["total_score"]).abs().max() max_diff = (check["mdgs_score"] - check["total_score"]).abs().max()
@@ -1317,12 +1184,9 @@ class FoodSecurityAggregator:
self.logger.info(f" -> {status} (n_checked={len(check)})") self.logger.info(f" -> {status} (n_checked={len(check)})")
def _finalize(self, table_name: str, rows_loaded: int): def _finalize(self, table_name: str, rows_loaded: int):
self.load_metadata[table_name].update({ self.load_metadata[table_name].update({"rows_loaded": rows_loaded, "status": "success", "end_time": datetime.now()})
"rows_loaded": rows_loaded, "status": "success", "end_time": datetime.now(),
})
log_update(self.client, "DW", table_name, "full_load", rows_loaded) log_update(self.client, "DW", table_name, "full_load", rows_loaded)
self.logger.info(f" {table_name}: {rows_loaded:,} rows -> [Gold] fs_asean_gold") self.logger.info(f" {table_name}: {rows_loaded:,} rows -> [Gold] fs_asean_gold")
self.logger.info(f" Metadata -> [AUDIT] etl_logs")
def _fail(self, table_name: str, error: Exception): def _fail(self, table_name: str, error: Exception):
self.load_metadata[table_name].update({"status": "failed", "end_time": datetime.now()}) self.load_metadata[table_name].update({"status": "failed", "end_time": datetime.now()})
@@ -1337,12 +1201,7 @@ class FoodSecurityAggregator:
start = datetime.now() start = datetime.now()
self.logger.info("\n" + "=" * 70) self.logger.info("\n" + "=" * 70)
self.logger.info("FOOD SECURITY AGGREGATION — 6 TABLES -> fs_asean_gold") self.logger.info("FOOD SECURITY AGGREGATION — 6 TABLES -> fs_asean_gold")
self.logger.info(" Source : fact_asean_food_security_selected") self.logger.info(f" Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info(" Outputs : agg_pillar_composite | agg_pillar_by_country")
self.logger.info(" agg_framework_by_country| agg_framework_asean")
self.logger.info(" agg_narrative_overview | agg_narrative_pillar")
self.logger.info(" NOTE : framework (MDGs/SDGs) dibaca dari kolom tabel,")
self.logger.info(" bukan heuristik gap min_year")
self.logger.info("=" * 70) self.logger.info("=" * 70)
self.load_data() self.load_data()
@@ -1352,15 +1211,8 @@ class FoodSecurityAggregator:
df_pillar_by_country = self.calc_pillar_by_country() df_pillar_by_country = self.calc_pillar_by_country()
df_framework_by_country = self.calc_framework_by_country() df_framework_by_country = self.calc_framework_by_country()
df_framework_asean = self.calc_framework_asean() df_framework_asean = self.calc_framework_asean()
self.calc_narrative_overview(df_framework_asean=df_framework_asean, df_framework_by_country=df_framework_by_country)
self.calc_narrative_overview( self.calc_narrative_pillar(df_pillar_composite=df_pillar_composite, df_pillar_by_country=df_pillar_by_country)
df_framework_asean = df_framework_asean,
df_framework_by_country = df_framework_by_country,
)
self.calc_narrative_pillar(
df_pillar_composite = df_pillar_composite,
df_pillar_by_country = df_pillar_by_country,
)
duration = (datetime.now() - start).total_seconds() duration = (datetime.now() - start).total_seconds()
total_rows = sum(m["rows_loaded"] for m in self.load_metadata.values()) total_rows = sum(m["rows_loaded"] for m in self.load_metadata.values())
@@ -1376,14 +1228,10 @@ class FoodSecurityAggregator:
# ============================================================================= # =============================================================================
# AIRFLOW TASK FUNCTIONS # AIRFLOW & MAIN
# ============================================================================= # =============================================================================
def run_aggregation(): def run_aggregation():
"""
Airflow task: Hitung semua agregasi dari fact_asean_food_security_selected.
Dipanggil setelah analytical_layer_to_gold selesai.
"""
from scripts.bigquery_config import get_bigquery_client from scripts.bigquery_config import get_bigquery_client
client = get_bigquery_client() client = get_bigquery_client()
agg = FoodSecurityAggregator(client) agg = FoodSecurityAggregator(client)
@@ -1392,13 +1240,8 @@ def run_aggregation():
print(f"Aggregation completed: {total:,} total rows loaded") print(f"Aggregation completed: {total:,} total rows loaded")
# =============================================================================
# MAIN EXECUTION
# =============================================================================
if __name__ == "__main__": if __name__ == "__main__":
import io import io
if _sys.stdout.encoding and _sys.stdout.encoding.lower() not in ("utf-8", "utf8"): if _sys.stdout.encoding and _sys.stdout.encoding.lower() not in ("utf-8", "utf8"):
_sys.stdout = io.TextIOWrapper(_sys.stdout.buffer, encoding="utf-8", errors="replace") _sys.stdout = io.TextIOWrapper(_sys.stdout.buffer, encoding="utf-8", errors="replace")
if _sys.stderr.encoding and _sys.stderr.encoding.lower() not in ("utf-8", "utf8"): if _sys.stderr.encoding and _sys.stderr.encoding.lower() not in ("utf-8", "utf8"):
@@ -1406,9 +1249,7 @@ if __name__ == "__main__":
print("=" * 70) print("=" * 70)
print("FOOD SECURITY AGGREGATION -> fs_asean_gold") print("FOOD SECURITY AGGREGATION -> fs_asean_gold")
print(f" Source : fact_asean_food_security_selected") print(f"Condition threshold: bad<{THRESHOLD_BAD}, moderate {THRESHOLD_BAD}-{THRESHOLD_GOOD}, good>{THRESHOLD_GOOD}")
print(f" Framework classification : dari kolom tabel (bukan heuristik)")
print(f" NORMALIZE_FRAMEWORKS_JOINTLY : {NORMALIZE_FRAMEWORKS_JOINTLY}")
print("=" * 70) print("=" * 70)
logger = setup_logging() logger = setup_logging()

View File

@@ -4,24 +4,31 @@ fact_asean_food_security_selected disimpan di fs_asean_gold (layer='gold')
Filtering Order: Filtering Order:
1. Load data (single years only) 1. Load data (single years only)
2. Determine year boundaries (2013 - auto-detected end year) 2. Determine year boundaries (2013 - auto-detected end year, baseline=2023 per syarat dosen)
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps) 3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
4. Filter countries with ALL pillars (FIXED SET) 4. Filter countries with ALL pillars (FIXED SET)
5. Filter indicators with consistent presence across FIXED countries 5. Filter indicators with consistent presence across FIXED countries
6. Determine SDGs start year & assign framework (MDGs/SDGs) per indicator 6. Determine SDG start year & assign framework (MDGs/SDGs) per indicator
7. Calculate YoY per indicator per country 7. Verify no gaps
8. Analyze indicator availability by year 8. Calculate norm_value_1_100 per indicator per country (min-max, direction-aware)
9. Save analytical table (dengan nama/label lengkap + kolom framework + YoY untuk Looker Studio) 9. Calculate YoY per indicator per country
10. Analyze indicator availability by year
11. Save analytical table
NORMALISASI (Step 8):
- norm_value_1_100 = min-max normalisasi nilai raw per indikator, skala 1-100
- Direction-aware: lower_better diinvert sehingga nilai tinggi selalu = lebih baik
- Normalisasi dilakukan GLOBAL per indikator (semua negara, semua tahun sekaligus)
sehingga nilai antar negara dan antar tahun tetap comparable
- Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio
FRAMEWORK LOGIC: FRAMEWORK LOGIC:
- SDG_START_YEAR = 2016 (default; auto-detect jika indikator SDGs pertama kali muncul lebih awal/lambat) - SDG start year dideteksi dari data: tahun pertama indikator FIES lengkap
di semua fixed countries (setelah Step 3-5 filter selesai)
- Indikator yang namanya ada di SDG_INDICATOR_KEYWORDS: - Indikator yang namanya ada di SDG_INDICATOR_KEYWORDS:
* Jika data mulai >= SDG_START_YEAR -> 'SDGs' * Jika actual_start_year >= sdg_start_year -> 'SDGs'
* Jika data mulai < SDG_START_YEAR -> 'MDGs' * Jika actual_start_year < sdg_start_year -> 'MDGs'
(artinya indikator ini sudah ada sebelum SDGs, mis. undernourishment)
- Indikator yang namanya TIDAK ada di SDG_INDICATOR_KEYWORDS -> 'MDGs' - Indikator yang namanya TIDAK ada di SDG_INDICATOR_KEYWORDS -> 'MDGs'
- Penentuan framework dilakukan SETELAH filter selesai (data sudah bersih & range sudah fixed)
sehingga start_year per indikator yang digunakan adalah start_year AKTUAL di dataset ini.
""" """
import pandas as pd import pandas as pd
@@ -50,15 +57,6 @@ from google.cloud import bigquery
# ============================================================================= # =============================================================================
# SDG INDICATOR KEYWORDS # SDG INDICATOR KEYWORDS
# ============================================================================= # =============================================================================
# Daftar nama indikator (lowercase) yang termasuk dalam SDG Goal 2.
# Matching dilakukan dengan `kw in indicator_name.lower()` sehingga
# partial match tetap valid (menangani variasi format nama).
#
# Logika framework:
# - Nama ada di set ini + start_year >= SDG_START_YEAR -> 'SDGs'
# - Nama ada di set ini + start_year < SDG_START_YEAR -> 'MDGs'
# (indikator sudah eksis sebelum SDGs, mis. prevalence of undernourishment)
# - Nama TIDAK ada di set ini -> 'MDGs'
SDG_INDICATOR_KEYWORDS = frozenset([ SDG_INDICATOR_KEYWORDS = frozenset([
# TARGET 2.1.1 — Prevalence of undernourishment (shared, sudah ada sebelum SDGs) # TARGET 2.1.1 — Prevalence of undernourishment (shared, sudah ada sebelum SDGs)
@@ -90,34 +88,55 @@ SDG_INDICATOR_KEYWORDS = frozenset([
"number of women of reproductive age (15-49 years) affected by anemia (million)", "number of women of reproductive age (15-49 years) affected by anemia (million)",
]) ])
# Tahun resmi SDGs mulai berlaku (2030 Agenda adopted September 2015, # Proxy keywords untuk deteksi era SDGs dari data (indikator murni baru di SDGs)
# data reporting mulai 2016). Dipakai sebagai default jika auto-detect gagal. _SDG_ERA_PROXY_KEYWORDS = frozenset([
SDG_START_YEAR_DEFAULT = 2016 "food insecurity",
"anemia",
"anaemia",
])
# =============================================================================
# THRESHOLD KONDISI (fixed absolute, skala 1-100)
# =============================================================================
# Digunakan untuk assign kondisi di analysis_layer.
# Didefinisikan di sini agar konsisten antara kedua file.
# bad : norm_value_1_100 < THRESHOLD_BAD
# good : norm_value_1_100 > THRESHOLD_GOOD
# moderate : di antara keduanya
THRESHOLD_BAD = 40.0
THRESHOLD_GOOD = 60.0
def assign_framework_dynamic( def assign_condition(norm_value_1_100: float) -> str:
"""
Assign kondisi berdasarkan norm_value_1_100 (skala 1-100, sudah direction-aware).
Nilai tinggi selalu berarti lebih baik (lower_better sudah diinvert).
Returns: 'good' / 'moderate' / 'bad'
"""
if pd.isna(norm_value_1_100):
return None
if norm_value_1_100 > THRESHOLD_GOOD:
return 'good'
if norm_value_1_100 < THRESHOLD_BAD:
return 'bad'
return 'moderate'
def assign_framework(
indicator_name: str, indicator_name: str,
indicator_start_year: int, actual_start_year: int,
sdg_start_year: int, sdg_start_year: int,
) -> str: ) -> str:
""" """
Tentukan framework (MDGs/SDGs) berdasarkan: Tentukan framework (MDGs/SDGs) per indikator.
1. Apakah nama indikator ada di SDG_INDICATOR_KEYWORDS? 'SDGs' jika nama ada di SDG_INDICATOR_KEYWORDS DAN actual_start_year >= sdg_start_year.
2. Apakah data indikator ini mulai pada tahun >= sdg_start_year? 'MDGs' untuk semua kasus lainnya.
Args:
indicator_name : Nama indikator (akan di-lowercase untuk matching)
indicator_start_year : Tahun pertama data indikator ini tersedia di dataset
sdg_start_year : Tahun mulai SDGs (dari auto-detect atau default)
Returns:
'SDGs' jika indikator termasuk SDG list DAN mulai >= sdg_start_year
'MDGs' untuk semua kasus lainnya
""" """
ind_lower = str(indicator_name).lower().strip() name_lower = str(indicator_name).lower().strip()
is_sdg_name = any(kw in ind_lower for kw in SDG_INDICATOR_KEYWORDS) in_sdg_list = name_lower in SDG_INDICATOR_KEYWORDS
if in_sdg_list and actual_start_year >= sdg_start_year:
if is_sdg_name and indicator_start_year >= sdg_start_year:
return 'SDGs' return 'SDGs'
return 'MDGs' return 'MDGs'
@@ -130,21 +149,12 @@ class AnalyticalLayerLoader:
""" """
Analytical Layer Loader for BigQuery Analytical Layer Loader for BigQuery
Key Logic: Output kolom fact_asean_food_security_selected:
1. Complete per country (no gaps from start_year to end_year)
2. Filter countries with all pillars
3. Ensure indicators have consistent country count across all years
4. Determine SDGs start year & assign framework per indicator dynamically
5. Calculate YoY (year-over-year) change per indicator per country
6. Save dengan kolom lengkap (nama + ID + framework + YoY) untuk Looker Studio
Output: fact_asean_food_security_selected -> DW layer (Gold) -> fs_asean_gold
Kolom output:
country_id, country_name, country_id, country_name,
indicator_id, indicator_name, direction, framework, indicator_id, indicator_name, direction, framework,
pillar_id, pillar_name, pillar_id, pillar_name,
time_id, year, value, time_id, year, value,
norm_value_1_100, <- NEWmin-max norm per indikator, skala 1-100, direction-aware
yoy_change, yoy_pct yoy_change, yoy_pct
""" """
@@ -162,10 +172,9 @@ class AnalyticalLayerLoader:
self.start_year = 2013 self.start_year = 2013
self.end_year = None self.end_year = None
self.baseline_year = 2023 self.baseline_year = 2023 # hardcode per syarat dosen (tahun terlengkap)
# SDGs-related — di-set oleh determine_sdg_start_year() self.sdg_start_year = None
self.sdg_start_year = SDG_START_YEAR_DEFAULT
self.pipeline_metadata = { self.pipeline_metadata = {
'source_class' : self.__class__.__name__, 'source_class' : self.__class__.__name__,
@@ -191,8 +200,6 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80) self.logger.info("=" * 80)
try: try:
# Tidak include framework dari dim_indicator —
# framework akan ditentukan dinamis di Step 6 (determine_sdg_start_year)
query = f""" query = f"""
SELECT SELECT
f.country_id, f.country_id,
@@ -224,12 +231,9 @@ class AnalyticalLayerLoader:
if 'is_year_range' in self.df_clean.columns: if 'is_year_range' in self.df_clean.columns:
yr = self.df_clean['is_year_range'].value_counts() yr = self.df_clean['is_year_range'].value_counts()
self.logger.info(f" Breakdown:")
self.logger.info( self.logger.info(
f" Single years (is_year_range=False): {yr.get(False, 0):,}" f" Single years: {yr.get(False, 0):,} | "
) f"Year ranges: {yr.get(True, 0):,}"
self.logger.info(
f" Year ranges (is_year_range=True): {yr.get(True, 0):,}"
) )
self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold') self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
@@ -256,15 +260,17 @@ class AnalyticalLayerLoader:
self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES") self.logger.info("STEP 2: DETERMINE YEAR BOUNDARIES")
self.logger.info("=" * 80) self.logger.info("=" * 80)
df_2023 = self.df_clean[self.df_clean['year'] == self.baseline_year] # baseline_year = 2023 hardcode (syarat dosen: minimal 2023)
baseline_indicator_count = df_2023['indicator_id'].nunique() df_baseline = self.df_clean[self.df_clean['year'] == self.baseline_year]
baseline_indicator_count = df_baseline['indicator_id'].nunique()
self.logger.info(f"\nBaseline Year: {self.baseline_year}") self.logger.info(f"\n Baseline year (hardcode, syarat dosen): {self.baseline_year}")
self.logger.info(f"Baseline Indicator Count: {baseline_indicator_count}") self.logger.info(f" Baseline indicator count: {baseline_indicator_count}")
years_sorted = sorted(self.df_clean['year'].unique(), reverse=True) years_sorted = sorted(self.df_clean['year'].unique(), reverse=True)
selected_end_year = None selected_end_year = None
self.logger.info(f"\n Scanning end_year (>= {self.baseline_year}):")
for year in years_sorted: for year in years_sorted:
if year >= self.baseline_year: if year >= self.baseline_year:
df_year = self.df_clean[self.df_clean['year'] == year] df_year = self.df_clean[self.df_clean['year'] == year]
@@ -276,9 +282,9 @@ class AnalyticalLayerLoader:
if selected_end_year is None: if selected_end_year is None:
selected_end_year = self.baseline_year selected_end_year = self.baseline_year
self.logger.warning(f" [!] No year found, using baseline: {selected_end_year}") self.logger.warning(f" [!] Fallback to baseline: {selected_end_year}")
else: else:
self.logger.info(f"\n [OK] Selected End Year: {selected_end_year}") self.logger.info(f"\n [OK] Selected end year: {selected_end_year}")
self.end_year = selected_end_year self.end_year = selected_end_year
original_count = len(self.df_clean) original_count = len(self.df_clean)
@@ -463,9 +469,7 @@ class AnalyticalLayerLoader:
else: else:
removed_indicators.append({ removed_indicators.append({
'indicator_name': indicator_name, 'indicator_name': indicator_name,
'reason' : ( 'reason' : f"missing countries in years: {', '.join(problematic_years[:5])}"
f"missing countries in years: {', '.join(problematic_years[:5])}"
)
}) })
self.logger.info(f"\n [+] Valid: {len(valid_indicators)}") self.logger.info(f"\n [+] Valid: {len(valid_indicators)}")
@@ -500,133 +504,86 @@ class AnalyticalLayerLoader:
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def determine_sdg_start_year(self): def determine_sdg_start_year(self):
"""
Tentukan tahun mulai SDGs secara otomatis dari data aktual, lalu
assign kolom 'framework' (MDGs/SDGs) ke setiap baris di df_clean.
Logika penentuan SDG_START_YEAR:
- Cari indikator yang namanya ada di SDG_INDICATOR_KEYWORDS (FIES, anaemia, dll.)
dan yang diyakini HANYA ada di SDGs (bukan shared dengan MDGs).
Proxy: indikator dengan keyword 'food insecurity' atau 'anemia'.
- Ambil tahun pertama (min year) dari indikator-indikator tersebut di dataset ini.
- Jika ditemukan -> sdg_start_year = tahun pertama itu.
- Jika tidak ditemukan -> sdg_start_year = SDG_START_YEAR_DEFAULT (2016).
Logika assign framework per indikator (assign_framework_dynamic):
- Nama ada di SDG_INDICATOR_KEYWORDS + start_year >= sdg_start_year -> 'SDGs'
- Nama ada di SDG_INDICATOR_KEYWORDS + start_year < sdg_start_year -> 'MDGs'
(indikator seperti undernourishment sudah ada sebelum SDGs)
- Nama TIDAK ada di SDG_INDICATOR_KEYWORDS -> 'MDGs'
"""
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK") self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK")
self.logger.info("=" * 80) self.logger.info("=" * 80)
# --- 6a. Auto-detect SDG start year dari data aktual --- # actual_start_year per indikator = max(min_year per country)
# Proxy SDGs-only: indikator yang pasti baru di SDGs (FIES & anaemia) # = konsisten dengan max_start_year di Step 5
sdg_proxy_keywords = [ indicator_actual_start = (
'food insecurity',
'anemia',
'anaemia',
]
sdg_proxy_mask = self.df_clean['indicator_name'].str.lower().apply(
lambda n: any(kw in n for kw in sdg_proxy_keywords)
)
df_sdg_proxy = self.df_clean[sdg_proxy_mask]
if len(df_sdg_proxy) > 0:
detected_start = int(df_sdg_proxy['year'].min())
self.sdg_start_year = detected_start
self.logger.info(
f"\n [OK] SDG start year AUTO-DETECTED dari data: {self.sdg_start_year}"
)
self.logger.info(f" Proxy indicators used (sample):")
proxy_sample = (
df_sdg_proxy['indicator_name']
.drop_duplicates()
.head(5)
.tolist()
)
for ind in proxy_sample:
self.logger.info(f" - {ind}")
else:
self.sdg_start_year = SDG_START_YEAR_DEFAULT
self.logger.warning(
f"\n [WARN] SDG proxy indicators not found in dataset. "
f"Using default: {self.sdg_start_year}"
)
self.logger.info(f"\n SDG_START_YEAR = {self.sdg_start_year}")
# --- 6b. Hitung start_year aktual per indikator di dataset ini ---
indicator_start = (
self.df_clean self.df_clean
.groupby(['indicator_id', 'indicator_name', 'country_id'])['year']
.min().reset_index()
.groupby(['indicator_id', 'indicator_name'])['year'] .groupby(['indicator_id', 'indicator_name'])['year']
.min() .max().reset_index()
.reset_index()
) )
indicator_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year'] indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year']
# --- 6c. Assign framework per indikator --- # Deteksi sdg_start_year dari proxy SDGs-only (FIES & anaemia)
indicator_start['framework'] = indicator_start.apply( proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply(
lambda row: assign_framework_dynamic( lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS)
)
df_proxy = indicator_actual_start[proxy_mask]
if df_proxy.empty:
raise ValueError(
"Tidak ada indikator proxy SDGs (FIES/anaemia) yang lolos filter. "
"Pastikan indikator FIES dan anaemia ada di data."
)
self.sdg_start_year = int(df_proxy['actual_start_year'].min())
self.logger.info(f"\n sdg_start_year = {self.sdg_start_year}")
self.logger.info(f" Proxy indicators:")
for _, row in df_proxy.iterrows():
self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}")
# Assign framework per indikator
indicator_actual_start['framework'] = indicator_actual_start.apply(
lambda row: assign_framework(
indicator_name = row['indicator_name'], indicator_name = row['indicator_name'],
indicator_start_year = int(row['actual_start_year']), actual_start_year = int(row['actual_start_year']),
sdg_start_year = self.sdg_start_year, sdg_start_year = self.sdg_start_year,
), ),
axis=1 axis=1
) )
# --- 6d. Log hasil assignment --- # Log hasil
self.logger.info(f"\n Framework assignment per indicator:") self.logger.info(f"\n Framework assignment:")
self.logger.info(f" {'-'*85}") self.logger.info(f" {'-'*80}")
self.logger.info( self.logger.info(f" {'ID':<5} {'Framework':<10} {'Start Yr':<10} {'Indicator Name'}")
f" {'ID':<5} {'Framework':<10} {'Start Yr':<10} {'Indicator Name'}" self.logger.info(f" {'-'*80}")
) for _, row in indicator_actual_start.sort_values(
self.logger.info(f" {'-'*85}")
for _, row in indicator_start.sort_values(
['framework', 'actual_start_year', 'indicator_name'] ['framework', 'actual_start_year', 'indicator_name']
).iterrows(): ).iterrows():
is_in_sdg_list = any(
kw in str(row['indicator_name']).lower()
for kw in SDG_INDICATOR_KEYWORDS
)
note = " [in SDG list]" if is_in_sdg_list else ""
self.logger.info( self.logger.info(
f" {int(row['indicator_id']):<5} {row['framework']:<10} " f" {int(row['indicator_id']):<5} {row['framework']:<10} "
f"{int(row['actual_start_year']):<10} {row['indicator_name'][:55]}{note}" f"{int(row['actual_start_year']):<10} {row['indicator_name'][:55]}"
) )
fw_summary = indicator_start['framework'].value_counts() fw_summary = indicator_actual_start['framework'].value_counts()
self.logger.info(f"\n Framework summary:") self.logger.info(f"\n Ringkasan: " + " | ".join(f"{fw}: {cnt}" for fw, cnt in fw_summary.items()))
for fw, cnt in fw_summary.items():
self.logger.info(f" {fw}: {cnt} indicators")
# --- 6e. Merge framework ke df_clean --- # Merge ke df_clean
self.df_clean = self.df_clean.merge( self.df_clean = self.df_clean.merge(
indicator_start[['indicator_id', 'framework']], indicator_actual_start[['indicator_id', 'framework']],
on='indicator_id', how='left' on='indicator_id', how='left'
) )
self.df_clean['framework'] = self.df_clean['framework'].fillna('MDGs') self.df_clean['framework'] = self.df_clean['framework'].fillna('MDGs')
self.logger.info(f"\n [OK] Kolom 'framework' ditambahkan ke df_clean")
self.logger.info( self.logger.info(
f" Row distribution — MDGs: " f"\n [OK] 'framework' ditambahkan — "
f"{(self.df_clean['framework'] == 'MDGs').sum():,} | " f"MDGs: {(self.df_clean['framework'] == 'MDGs').sum():,} rows | "
f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,}" f"SDGs: {(self.df_clean['framework'] == 'SDGs').sum():,} rows"
) )
return self.df_clean return self.df_clean
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 6b: VERIFY NO GAPS # STEP 7: VERIFY NO GAPS
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def verify_no_gaps(self): def verify_no_gaps(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 6c: VERIFY NO GAPS") self.logger.info("STEP 7: VERIFY NO GAPS")
self.logger.info("=" * 80) self.logger.info("=" * 80)
expected_countries = len(self.selected_country_ids) expected_countries = len(self.selected_country_ids)
@@ -652,21 +609,110 @@ class AnalyticalLayerLoader:
return True return True
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 7: CALCULATE YOY # STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR PER COUNTRY
# ------------------------------------------------------------------
def calculate_norm_value(self):
"""
Hitung norm_value_1_100 per indikator — min-max normalisasi skala 1-100,
direction-aware.
CARA KERJA:
- Normalisasi dilakukan GLOBAL per indikator (semua negara + semua tahun sekaligus)
sehingga nilai antar negara dan antar tahun tetap comparable.
- lower_better diinvert: nilai tinggi selalu = kondisi lebih baik.
Contoh: undernourishment 5% (rendah = baik) → norm tinggi setelah invert.
- Skala 1-100 (bukan 0-100) untuk menghindari nilai absolut nol di Looker Studio.
- Kolom ini memungkinkan perbandingan lintas indikator yang berbeda satuan
(persen, juta orang, dll) karena sudah dinormalisasi ke skala yang sama.
Catatan:
- Berbeda dengan norm_value di _get_norm_value_df() di analysis_layer
yang skala 0-1 dan dipakai untuk agregasi composite score.
- norm_value_1_100 ini adalah per baris (per country per year per indicator),
untuk ditampilkan langsung di Looker Studio.
"""
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR")
self.logger.info("=" * 80)
DIRECTION_INVERT = frozenset({
"negative", "lower_better", "lower_is_better", "inverse", "neg",
})
df = self.df_clean.copy()
norm_parts = []
indicators = df.groupby(['indicator_id', 'indicator_name', 'direction'])
self.logger.info(f"\n {'ID':<5} {'Direction':<15} {'Invert':<8} {'Min':>10} {'Max':>10} {'Indicator Name'}")
self.logger.info(f" {'-'*90}")
for (ind_id, ind_name, direction), grp in indicators:
grp = grp.copy()
do_invert = str(direction).lower().strip() in DIRECTION_INVERT
valid_mask = grp['value'].notna()
n_valid = valid_mask.sum()
if n_valid < 2:
grp['norm_value_1_100'] = np.nan
norm_parts.append(grp)
continue
raw = grp.loc[valid_mask, 'value'].values
v_min = raw.min()
v_max = raw.max()
normed = np.full(len(grp), np.nan)
if v_min == v_max:
# Semua nilai sama → beri nilai tengah (50.5 pada skala 1-100)
normed[valid_mask.values] = 50.5
else:
# Min-max ke 0-1 dulu
scaled = (raw - v_min) / (v_max - v_min)
# Invert jika lower_better
if do_invert:
scaled = 1.0 - scaled
# Scale ke 1-100
normed[valid_mask.values] = 1.0 + scaled * 99.0
grp['norm_value_1_100'] = normed
self.logger.info(
f" {int(ind_id):<5} {direction:<15} {'YES' if do_invert else 'no':<8} "
f"{v_min:>10.3f} {v_max:>10.3f} {ind_name[:45]}"
)
norm_parts.append(grp)
self.df_clean = pd.concat(norm_parts, ignore_index=True)
# Statistik ringkasan
valid_norm = self.df_clean['norm_value_1_100'].notna().sum()
null_norm = self.df_clean['norm_value_1_100'].isna().sum()
self.logger.info(f"\n norm_value_1_100 — valid: {valid_norm:,} | null: {null_norm:,}")
self.logger.info(
f" Range aktual: "
f"{self.df_clean['norm_value_1_100'].min():.2f} - "
f"{self.df_clean['norm_value_1_100'].max():.2f}"
)
# Log distribusi kondisi berdasarkan threshold
self.df_clean['_condition_preview'] = self.df_clean['norm_value_1_100'].apply(assign_condition)
cond_dist = self.df_clean['_condition_preview'].value_counts()
self.logger.info(f"\n Distribusi kondisi (threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}):")
for cond, cnt in cond_dist.items():
self.logger.info(f" {cond}: {cnt:,} rows")
self.df_clean = self.df_clean.drop(columns=['_condition_preview'])
self.logger.info(f"\n [OK] Kolom 'norm_value_1_100' ditambahkan ke df_clean")
return self.df_clean
# ------------------------------------------------------------------
# STEP 9: CALCULATE YOY
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def calculate_yoy(self): def calculate_yoy(self):
"""
Hitung Year-over-Year (YoY) per indikator per negara.
Kolom yang ditambahkan:
yoy_change : selisih absolut -> value - value_tahun_sebelumnya
yoy_pct : perubahan relatif -> (yoy_change / abs(value_prev)) * 100
Baris tahun pertama per kombinasi country-indicator bernilai NULL (intentional).
"""
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 7: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY") self.logger.info("STEP 9: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY")
self.logger.info("=" * 80) self.logger.info("=" * 80)
df = self.df_clean.sort_values(['country_id', 'indicator_id', 'year']).copy() df = self.df_clean.sort_values(['country_id', 'indicator_id', 'year']).copy()
@@ -686,62 +732,19 @@ class AnalyticalLayerLoader:
self.logger.info(f" Total rows : {total_rows:,}") self.logger.info(f" Total rows : {total_rows:,}")
self.logger.info(f" YoY calculated : {valid_yoy:,}") self.logger.info(f" YoY calculated : {valid_yoy:,}")
self.logger.info(f" YoY NULL (base yr): {null_yoy:,} <- tahun pertama per country-indicator") self.logger.info(f" YoY NULL (base yr): {null_yoy:,}")
per_ind = (
df[df['yoy_pct'].notna()]
.groupby(['indicator_id', 'indicator_name'])['yoy_pct']
.agg(['mean', 'std', 'min', 'max'])
.reset_index()
)
per_ind.columns = ['indicator_id', 'indicator_name', 'mean', 'std', 'min', 'max']
self.logger.info(f"\n YoY summary per indicator (top 10 by abs mean change):")
self.logger.info(f" {'-'*100}")
self.logger.info(
f" {'ID':<5} {'Indicator Name':<52} {'Mean%':>8} {'Std%':>8} {'Min%':>8} {'Max%':>8}"
)
self.logger.info(f" {'-'*100}")
top_ind = per_ind.reindex(
per_ind['mean'].abs().sort_values(ascending=False).index
).head(10)
for _, row in top_ind.iterrows():
self.logger.info(
f" {int(row['indicator_id']):<5} {row['indicator_name'][:50]:<52} "
f"{row['mean']:>+8.2f} {row['std']:>8.2f} "
f"{row['min']:>+8.2f} {row['max']:>+8.2f}"
)
per_country = (
df[df['yoy_pct'].notna()]
.groupby(['country_id', 'country_name'])['yoy_pct']
.agg(['mean', 'std'])
.reset_index()
)
per_country.columns = ['country_id', 'country_name', 'mean_yoy', 'std_yoy']
self.logger.info(f"\n YoY summary per country:")
self.logger.info(f" {'-'*60}")
self.logger.info(f" {'Country':<30} {'Mean YoY%':>10} {'Std YoY%':>10}")
self.logger.info(f" {'-'*60}")
for _, row in per_country.sort_values('mean_yoy', ascending=False).iterrows():
self.logger.info(
f" {row['country_name']:<30} {row['mean_yoy']:>+10.2f} {row['std_yoy']:>10.2f}"
)
self.df_clean = df self.df_clean = df
self.logger.info(f"\n [OK] YoY columns added: yoy_change, yoy_pct") self.logger.info(f" [OK] Kolom 'yoy_change', 'yoy_pct' ditambahkan")
return self.df_clean return self.df_clean
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 8: ANALYZE INDICATOR AVAILABILITY BY YEAR # STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def analyze_indicator_availability_by_year(self): def analyze_indicator_availability_by_year(self):
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 8: ANALYZE INDICATOR AVAILABILITY BY YEAR") self.logger.info("STEP 10: ANALYZE INDICATOR AVAILABILITY BY YEAR")
self.logger.info("=" * 80) self.logger.info("=" * 80)
year_stats = self.df_clean.groupby('year').agg({ year_stats = self.df_clean.groupby('year').agg({
@@ -776,10 +779,7 @@ class AnalyticalLayerLoader:
) )
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}") self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
for pillar, count in indicator_details.groupby('pillar_name').size().items(): self.logger.info(f"Framework breakdown:")
self.logger.info(f" {pillar}: {count} indicators")
self.logger.info(f"\nFramework breakdown:")
for fw, count in indicator_details.groupby('framework').size().items(): for fw, count in indicator_details.groupby('framework').size().items():
self.logger.info(f" {fw}: {count} indicators") self.logger.info(f" {fw}: {count} indicators")
@@ -800,37 +800,23 @@ class AnalyticalLayerLoader:
return year_stats return year_stats
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# STEP 9: SAVE ANALYTICAL TABLE # STEP 11: SAVE ANALYTICAL TABLE
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def save_analytical_table(self): def save_analytical_table(self):
"""
Simpan fact_asean_food_security_selected ke Gold layer.
Kolom yang disimpan:
country_id, country_name — dimensi negara
indicator_id, indicator_name — dimensi indikator
direction — arah penilaian (higher/lower_better)
framework — MDGs/SDGs (ditentukan di Step 6)
pillar_id, pillar_name — dimensi pilar
time_id, year — dimensi waktu
value — nilai indikator
yoy_change — perubahan absolut YoY (NULL di tahun pertama)
yoy_pct — perubahan relatif YoY dalam % (NULL di tahun pertama)
"""
table_name = 'fact_asean_food_security_selected' table_name = 'fact_asean_food_security_selected'
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info(f"STEP 9: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold") self.logger.info(f"STEP 11: SAVE TO [DW/Gold] {table_name} -> fs_asean_gold")
self.logger.info("=" * 80) self.logger.info("=" * 80)
try: try:
# Pastikan kolom YoY tersedia — fallback jika calculate_yoy() tidak dipanggil if 'framework' not in self.df_clean.columns:
if 'yoy_change' not in self.df_clean.columns or 'yoy_pct' not in self.df_clean.columns: raise ValueError("Kolom 'framework' tidak ada. Pastikan Step 6 sudah dijalankan.")
self.logger.warning( if 'norm_value_1_100' not in self.df_clean.columns:
" [WARN] Kolom YoY tidak ditemukan. Menjalankan calculate_yoy() sebagai fallback..." raise ValueError("Kolom 'norm_value_1_100' tidak ada. Pastikan Step 8 sudah dijalankan.")
) if 'yoy_change' not in self.df_clean.columns:
self.calculate_yoy() raise ValueError("Kolom 'yoy_change' tidak ada. Pastikan Step 9 sudah dijalankan.")
analytical_df = self.df_clean[[ analytical_df = self.df_clean[[
'country_id', 'country_id',
@@ -844,6 +830,7 @@ class AnalyticalLayerLoader:
'time_id', 'time_id',
'year', 'year',
'value', 'value',
'norm_value_1_100',
'yoy_change', 'yoy_change',
'yoy_pct', 'yoy_pct',
]].copy() ]].copy()
@@ -863,21 +850,22 @@ class AnalyticalLayerLoader:
analytical_df['time_id'] = analytical_df['time_id'].astype(int) analytical_df['time_id'] = analytical_df['time_id'].astype(int)
analytical_df['year'] = analytical_df['year'].astype(int) analytical_df['year'] = analytical_df['year'].astype(int)
analytical_df['value'] = analytical_df['value'].astype(float) analytical_df['value'] = analytical_df['value'].astype(float)
analytical_df['norm_value_1_100'] = analytical_df['norm_value_1_100'].astype(float)
analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float) analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float)
analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float) analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float)
self.logger.info(f" Kolom yang disimpan: {list(analytical_df.columns)}")
self.logger.info(f" Total rows: {len(analytical_df):,}") self.logger.info(f" Total rows: {len(analytical_df):,}")
fw_dist = analytical_df.drop_duplicates('indicator_id')['framework'].value_counts() fw_dist = analytical_df.drop_duplicates('indicator_id')['framework'].value_counts()
self.logger.info(f" Framework distribution (per indikator unik):") self.logger.info(f" Framework distribution:")
for fw, cnt in fw_dist.items(): for fw, cnt in fw_dist.items():
self.logger.info(f" {fw}: {cnt} indicators") self.logger.info(f" {fw}: {cnt} indicators")
yoy_valid = analytical_df['yoy_pct'].notna().sum() self.logger.info(
yoy_null = analytical_df['yoy_pct'].isna().sum() f" norm_value_1_100 range: "
self.logger.info(f" YoY rows (calculated): {yoy_valid:,}") f"{analytical_df['norm_value_1_100'].min():.2f} - "
self.logger.info(f" YoY rows (NULL/base) : {yoy_null:,}") f"{analytical_df['norm_value_1_100'].max():.2f}"
)
schema = [ schema = [
bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"),
@@ -891,6 +879,7 @@ class AnalyticalLayerLoader:
bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"), bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"), bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"),
bigquery.SchemaField("norm_value_1_100", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"), bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"),
] ]
@@ -915,30 +904,26 @@ class AnalyticalLayerLoader:
'config_snapshot' : json.dumps({ 'config_snapshot' : json.dumps({
'start_year' : self.start_year, 'start_year' : self.start_year,
'end_year' : self.end_year, 'end_year' : self.end_year,
'baseline_year' : self.baseline_year,
'sdg_start_year' : self.sdg_start_year, 'sdg_start_year' : self.sdg_start_year,
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'no_gaps' : True, 'norm_scale' : '1-100 per indicator global minmax direction-aware',
'layer' : 'gold', 'condition_thresholds': {
'framework_logic' : ( 'bad' : f'< {THRESHOLD_BAD}',
f"SDGs if in SDG_INDICATOR_KEYWORDS AND start_year >= {self.sdg_start_year}, " 'moderate': f'{THRESHOLD_BAD}-{THRESHOLD_GOOD}',
"else MDGs" 'good' : f'> {THRESHOLD_GOOD}',
), },
}), }),
'validation_metrics' : json.dumps({ 'validation_metrics' : json.dumps({
'fixed_countries' : len(self.selected_country_ids), 'fixed_countries' : len(self.selected_country_ids),
'total_indicators': int(self.df_clean['indicator_id'].nunique()), 'total_indicators': int(self.df_clean['indicator_id'].nunique()),
'sdg_start_year' : self.sdg_start_year, 'sdg_start_year' : self.sdg_start_year,
'framework_dist' : fw_dist.to_dict(), 'framework_dist' : fw_dist.to_dict(),
'yoy_rows_valid' : int(yoy_valid),
'yoy_rows_null' : int(yoy_null),
}) })
} }
save_etl_metadata(self.client, metadata) save_etl_metadata(self.client, metadata)
self.logger.info( self.logger.info(f" [OK] {table_name}: {rows_loaded:,} rows -> fs_asean_gold")
f" {table_name}: {rows_loaded:,} rows -> [DW/Gold] fs_asean_gold"
)
self.logger.info(f" Metadata -> [AUDIT] etl_metadata")
return rows_loaded return rows_loaded
except Exception as e: except Exception as e:
@@ -955,9 +940,8 @@ class AnalyticalLayerLoader:
self.logger.info("\n" + "=" * 80) self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold") self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
self.logger.info("Kolom: country_id/name, indicator_id/name, direction, framework,") self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
self.logger.info(" pillar_id/name, time_id, year, value, yoy_change, yoy_pct") self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
self.logger.info(f"Framework: ditentukan dinamis berdasarkan SDG_START_YEAR (auto-detect)")
self.logger.info("=" * 80) self.logger.info("=" * 80)
self.load_source_data() self.load_source_data()
@@ -965,9 +949,10 @@ class AnalyticalLayerLoader:
self.filter_complete_indicators_per_country() self.filter_complete_indicators_per_country()
self.select_countries_with_all_pillars() self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries() self.filter_indicators_consistent_across_fixed_countries()
self.determine_sdg_start_year() # Step 6: auto-detect SDG year & assign framework self.determine_sdg_start_year()
self.verify_no_gaps() # Step 6c: verifikasi tidak ada gap self.verify_no_gaps()
self.calculate_yoy() # Step 7: hitung YoY self.calculate_norm_value() # Step 8: norm_value_1_100
self.calculate_yoy() # Step 9: yoy_change, yoy_pct
self.analyze_indicator_availability_by_year() self.analyze_indicator_availability_by_year()
self.save_analytical_table() self.save_analytical_table()
@@ -990,10 +975,6 @@ class AnalyticalLayerLoader:
# ============================================================================= # =============================================================================
def run_analytical_layer(): def run_analytical_layer():
"""
Airflow task: Build fact_asean_food_security_selected dari fact_food_security + dims.
Dipanggil setelah dimensional_model_to_gold selesai.
"""
from scripts.bigquery_config import get_bigquery_client from scripts.bigquery_config import get_bigquery_client
client = get_bigquery_client() client = get_bigquery_client()
loader = AnalyticalLayerLoader(client) loader = AnalyticalLayerLoader(client)
@@ -1009,7 +990,8 @@ if __name__ == "__main__":
print("=" * 80) print("=" * 80)
print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING") print("BIGQUERY ANALYTICAL LAYER - DATA FILTERING")
print("Output: fact_asean_food_security_selected -> fs_asean_gold") print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print("Framework: MDGs/SDGs ditentukan dinamis dari data (auto-detect SDG start year)") print(f"Norm: min-max 1-100 per indicator, direction-aware")
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
print("=" * 80) print("=" * 80)
logger = setup_logging() logger = setup_logging()