diff --git a/scripts/bigquery_analytical_layer.py b/scripts/bigquery_analytical_layer.py index 9cecb19..d96f033 100644 --- a/scripts/bigquery_analytical_layer.py +++ b/scripts/bigquery_analytical_layer.py @@ -12,10 +12,11 @@ Filtering Order: → Semua baris tetap ada; label framework ditentukan di Step 6 6. Assign framework (MDGs/SDGs) per indicator PER ROW → Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' selalu - → Indikator DI SDG_ONLY_KEYWORDS + year >= sdg_transition_year → 'SDGs' - → Indikator DI SDG_ONLY_KEYWORDS + year < sdg_transition_year → 'MDGs' - → sdg_transition_year = min(actual_start_year) dari semua SDG-only indicators - yang lolos filter (= tahun pertama data SDG-only konsisten di semua countries) + → Indikator DI SDG_ONLY_KEYWORDS + year >= SDG_TRANSITION_YEAR → 'SDGs' + → Indikator DI SDG_ONLY_KEYWORDS + year < SDG_TRANSITION_YEAR → 'MDGs' + → SDG_TRANSITION_YEAR = 2016 (HARDCODE — tanggal resmi SDGs berlaku) + BUKAN dari actual_start_year data, karena data anaemia/FIES bisa ada + sebelum 2016 namun tetap harus dilabeli MDGs pada tahun-tahun tersebut. 7. Verify no gaps (dari actual_start_year per indikator, bukan start_year global) 8. Calculate norm_value_1_100 per indicator (min-max, direction-aware, global) 9. Calculate YoY per indicator per country @@ -23,12 +24,19 @@ Filtering Order: 11. 
Save analytical table FRAMEWORK LOGIC: -- sdg_transition_year dihitung SATU KALI dari actual_start_year SDG-only indicators -- Semua SDG-only indicators menggunakan sdg_transition_year yang SAMA +- SDG_TRANSITION_YEAR = 2016 (HARDCODE, bukan auto-detect dari data) +- Semua SDG-only indicators menggunakan SDG_TRANSITION_YEAR yang SAMA sehingga label berubah serentak di satu titik waktu -- Baris sebelum sdg_transition_year → 'MDGs' (data tetap ada, tidak dihapus) -- Baris mulai sdg_transition_year → 'SDGs' -- Indikator non-SDG-only → 'MDGs' selalu +- SDG-only + year < SDG_TRANSITION_YEAR → 'MDGs' (data tetap ada, tidak dihapus) +- SDG-only + year >= SDG_TRANSITION_YEAR → 'SDGs' +- Non-SDG-only indicators → 'MDGs' selalu (di semua tahun) + +ALASAN HARDCODE: +- SDGs resmi diadopsi PBB pada 25 September 2015 dan mulai berlaku 1 Januari 2016 +- Indikator FIES dan anaemia punya data sebelum 2016 (dari MDGs era) +- Jika sdg_transition_year di-auto-detect dari min(actual_start_year), + maka akan = 2013 (karena data ada sejak 2013), sehingga semua tahun + berlabel SDGs — yang secara historis tidak tepat. """ import pandas as pd @@ -58,10 +66,13 @@ from google.cloud import bigquery # SDG-ONLY INDICATOR KEYWORDS # ============================================================================= # Hanya indikator yang MURNI BARU di era SDGs yang didaftarkan di sini. -# Indikator di set ini → 'SDGs' mulai dari sdg_transition_year. +# Indikator di set ini → 'SDGs' mulai dari SDG_TRANSITION_YEAR (2016). # Semua indikator lain (shared maupun tidak dikenal) → 'MDGs' di semua tahun. 
SDG_ONLY_KEYWORDS = frozenset([ + # TARGET 2.1.1 — Undernourishment / PoU (NOTE(review): also an MDG-era indicator — confirm SDG-only) + "prevalence of undernourishment (percent) (3-year average)", + "number of people undernourished (million) (3-year average)", # TARGET 2.1.2 — FIES (SDGs only) "prevalence of severe food insecurity in the total population (percent) (3-year average)", "prevalence of severe food insecurity in the male adult population (percent) (3-year average)", @@ -80,6 +91,15 @@ SDG_ONLY_KEYWORDS = frozenset([ "number of women of reproductive age (15-49 years) affected by anemia (million)", ]) +# ============================================================================= +# SDG TRANSITION YEAR — HARDCODE +# ============================================================================= +# SDGs resmi berlaku mulai 1 Januari 2016 (diadopsi PBB 25 September 2015). +# Nilai ini TIDAK boleh dihitung dari data karena indikator FIES/anaemia +# punya data historis sebelum 2016 yang harus tetap dilabeli 'MDGs'. + +SDG_TRANSITION_YEAR = 2016 + # ============================================================================= # THRESHOLD KONDISI (fixed absolute, skala 1-100) # ============================================================================= @@ -119,13 +139,11 @@ class AnalyticalLayerLoader: yoy_change, yoy_pct FRAMEWORK LOGIC: + - SDG_TRANSITION_YEAR = 2016 (HARDCODE — tanggal resmi SDGs berlaku) - Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' di SEMUA tahun - Indikator DI SDG_ONLY_KEYWORDS: - year < sdg_transition_year → 'MDGs' (data tetap ada, tidak dihapus) - year >= sdg_transition_year → 'SDGs' - - sdg_transition_year = min(actual_start_year) dari semua SDG-only indicators - yang lolos filter Step 3-5. Semua SDG-only indicators menggunakan - sdg_transition_year yang SAMA agar label berubah serentak. 
+ year < SDG_TRANSITION_YEAR (2016) → 'MDGs' (data tetap ada, tidak dihapus) + year >= SDG_TRANSITION_YEAR (2016) → 'SDGs' """ def __init__(self, client: bigquery.Client): @@ -140,12 +158,14 @@ class AnalyticalLayerLoader: self.selected_country_ids = None self.indicator_max_start_map = {} # indicator_id → max_start_year (dari Step 5) - self.sdg_transition_year = None # tahun SDGs mulai berlaku (dari Step 6) self.start_year = 2013 self.end_year = None self.baseline_year = 2023 + # SDG_TRANSITION_YEAR diambil dari konstanta modul (HARDCODE = 2016) + self.sdg_transition_year = SDG_TRANSITION_YEAR + self.pipeline_metadata = { 'source_class' : self.__class__.__name__, 'start_time' : None, @@ -455,14 +475,14 @@ class AnalyticalLayerLoader: # Filter hanya indikator yang valid. # PENTING: TIDAK menghapus baris year < max_start_year. # Semua baris tetap ada — label framework ditentukan di Step 6. - # max_start_year disimpan sebagai lookup untuk Step 6 & 7. + # max_start_year disimpan sebagai lookup untuk Step 7 (Step 6 hanya membacanya untuk logging). 
# ---------------------------------------------------------------- original_count = len(self.df_clean) self.df_clean = self.df_clean[ self.df_clean['indicator_id'].isin(valid_indicators) ].copy() - # Simpan max_start_year per indicator_id untuk Step 6 dan Step 7 + # Simpan max_start_year per indicator_id untuk Step 7 self.indicator_max_start_map = ( indicator_max_start[indicator_max_start['indicator_id'].isin(valid_indicators)] .set_index('indicator_id')['max_start_year'] @@ -484,86 +504,79 @@ class AnalyticalLayerLoader: # STEP 6: ASSIGN FRAMEWORK PER ROW # ------------------------------------------------------------------ - def determine_sdg_start_year(self): + def assign_framework(self): self.logger.info("\n" + "=" * 80) self.logger.info("STEP 6: ASSIGN FRAMEWORK PER ROW") self.logger.info("=" * 80) # ---------------------------------------------------------------- - # Bangun tabel actual_start_year per indikator dari - # indicator_max_start_map yang sudah ditetapkan di Step 5. + # SDG_TRANSITION_YEAR = 2016 (HARDCODE) + # SDGs diadopsi PBB 25 September 2015, berlaku 1 Januari 2016. + # + # PENTING — TIDAK dihitung dari data: + # Jika auto-detect dari min(actual_start_year SDG-only indicators), + # hasilnya = 2013 (karena data FIES/anaemia ada sejak 2013). + # Akibatnya year >= 2013 → SDGs → SEMUA tahun berlabel SDGs. + # Ini secara historis salah karena SDGs belum berlaku di 2013-2015. 
# ---------------------------------------------------------------- - indicator_actual_start = pd.DataFrame([ - {'indicator_id': ind_id, 'actual_start_year': int(start_yr)} - for ind_id, start_yr in self.indicator_max_start_map.items() - ]) + self.logger.info(f"\n SDG_TRANSITION_YEAR : {self.sdg_transition_year} (HARDCODE)") + self.logger.info(f" Alasan : SDGs resmi berlaku 1 Januari 2016") + self.logger.info(f" Bukan auto-detect : data FIES/anaemia ada sejak 2013,") + self.logger.info(f" tapi tahun 2013-2015 harus tetap MDGs") - # Merge indicator_name untuk logging - indicator_actual_start = indicator_actual_start.merge( - self.df_clean[['indicator_id', 'indicator_name']].drop_duplicates(), - on='indicator_id', how='left' + # ---------------------------------------------------------------- + # Identifikasi indikator SDG-only berdasarkan SDG_ONLY_KEYWORDS + # ---------------------------------------------------------------- + indicator_info = ( + self.df_clean[['indicator_id', 'indicator_name']] + .drop_duplicates() + .copy() ) - - # Tandai mana yang SDG-only - indicator_actual_start['is_sdg_only'] = ( - indicator_actual_start['indicator_name'] - .str.lower().str.strip() + indicator_info['is_sdg_only'] = ( + indicator_info['indicator_name'] + .str.lower() + .str.strip() .isin(SDG_ONLY_KEYWORDS) ) + sdg_only_ids = set( + indicator_info.loc[indicator_info['is_sdg_only'], 'indicator_id'] + ) + non_sdg_ids = set( + indicator_info.loc[~indicator_info['is_sdg_only'], 'indicator_id'] + ) + + self.logger.info(f"\n SDG-only indicators ({len(sdg_only_ids)}):") + for _, row in indicator_info[indicator_info['is_sdg_only']].iterrows(): + actual_start = self.indicator_max_start_map.get(row['indicator_id'], '?') + self.logger.info( + f" [SDG-only] id={int(row['indicator_id'])} " + f"actual_start={actual_start} | {row['indicator_name']}" + ) + + self.logger.info(f"\n Non-SDG-only indicators ({len(non_sdg_ids)}): → MDGs selalu") + # 
---------------------------------------------------------------- - # sdg_transition_year = min(actual_start_year) dari semua SDG-only - # indicators yang lolos filter. - # Ini adalah satu titik waktu di mana semua SDG-only indicators - # berubah dari 'MDGs' ke 'SDGs' secara SERENTAK. + # Validasi: pastikan ada SDG-only indicators yang lolos filter # ---------------------------------------------------------------- - sdg_only_df = indicator_actual_start[indicator_actual_start['is_sdg_only']] - if sdg_only_df.empty: + if not sdg_only_ids: raise ValueError( "Tidak ada indikator SDG-only (FIES/anaemia) yang lolos filter. " - "Pastikan indikator FIES dan anaemia ada di data." + "Pastikan nama indikator di SDG_ONLY_KEYWORDS cocok dengan data BigQuery." ) - self.sdg_transition_year = int(sdg_only_df['actual_start_year'].min()) - - self.logger.info(f"\n SDG-only indicators dan actual_start_year masing-masing:") - self.logger.info(f" {'-'*80}") - for _, row in sdg_only_df.iterrows(): - self.logger.info( - f" [SDG-only] actual_start={int(row['actual_start_year'])} | " - f"{row['indicator_name']}" - ) - - self.logger.info( - f"\n sdg_transition_year = {self.sdg_transition_year} " - f"(min actual_start_year dari semua SDG-only indicators)" - ) - - self.logger.info(f"\n Logika assign framework (PER BARIS):") - self.logger.info(f" ──────────────────────────────────────────────────────────") - self.logger.info(f" Indikator TIDAK di SDG_ONLY_KEYWORDS:") - self.logger.info(f" → 'MDGs' di semua tahun") - self.logger.info(f" Indikator DI SDG_ONLY_KEYWORDS:") - self.logger.info(f" year < {self.sdg_transition_year} → 'MDGs' (data tetap ada)") - self.logger.info(f" year >= {self.sdg_transition_year} → 'SDGs'") - self.logger.info(f" ──────────────────────────────────────────────────────────") - # ---------------------------------------------------------------- - # Assign framework dengan vectorized operation menggunakan - # sdg_transition_year (SATU nilai untuk semua SDG-only indicators) 
+ # Assign framework dengan vectorized np.where: + # + # Kondisi SDG-only AND year >= SDG_TRANSITION_YEAR → 'SDGs' + # Semua kondisi lain (non-SDG-only ATAU year < SDG_TRANSITION_YEAR) → 'MDGs' + # + # Hasilnya dalam 1 indikator SDG-only (misal anaemia, data mulai 2013): + # 2013, 2014, 2015 → 'MDGs' (data tetap ada) + # 2016, 2017, ... → 'SDGs' # ---------------------------------------------------------------- - # Tandai apakah setiap baris adalah SDG-only indicator - sdg_only_ids = set( - indicator_actual_start.loc[ - indicator_actual_start['is_sdg_only'], 'indicator_id' - ] - ) self.df_clean['_is_sdg_only'] = self.df_clean['indicator_id'].isin(sdg_only_ids) - # Assign framework: - # - Bukan SDG-only → 'MDGs' - # - SDG-only AND year >= sdg_transition_year → 'SDGs' - # - SDG-only AND year < sdg_transition_year → 'MDGs' self.df_clean['framework'] = np.where( self.df_clean['_is_sdg_only'] & (self.df_clean['year'] >= self.sdg_transition_year), @@ -571,19 +584,26 @@ class AnalyticalLayerLoader: 'MDGs' ) - # Drop kolom bantu self.df_clean = self.df_clean.drop(columns=['_is_sdg_only']) # ---------------------------------------------------------------- - # Log verifikasi per indikator + # Log verifikasi per indikator — tampilkan split MDGs/SDGs per tahun # ---------------------------------------------------------------- + self.logger.info(f"\n Logika assign framework (PER BARIS):") + self.logger.info(f" {'─'*72}") + self.logger.info(f" Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' di semua tahun") + self.logger.info(f" Indikator DI SDG_ONLY_KEYWORDS:") + self.logger.info(f" year < {self.sdg_transition_year} → 'MDGs' (data tetap ada, tidak dihapus)") + self.logger.info(f" year >= {self.sdg_transition_year} → 'SDGs'") + self.logger.info(f" {'─'*72}") + self.logger.info(f"\n Verifikasi framework per indikator:") - self.logger.info(f" {'-'*110}") + self.logger.info(f" {'─'*115}") self.logger.info( - f" {'ID':<5} {'Indicator Name':<52} {'Data From':<12} " - f"{'MDGs 
rows':<12} {'SDGs rows':<12} {'Note'}" + f" {'ID':<5} {'Indicator Name':<52} {'Data From':<11} " + f"{'MDGs rows':<11} {'SDGs rows':<11} {'Note'}" ) - self.logger.info(f" {'-'*110}") + self.logger.info(f" {'─'*115}") for ind_id, grp in self.df_clean.groupby('indicator_id'): ind_name = grp['indicator_name'].iloc[0] @@ -593,13 +613,17 @@ class AnalyticalLayerLoader: data_from = int(grp['year'].min()) if is_sdg_only: - note = f"SDGs from {self.sdg_transition_year}, MDGs before" + mdgs_yrs = sorted(grp[grp['framework'] == 'MDGs']['year'].unique()) + sdgs_yrs = sorted(grp[grp['framework'] == 'SDGs']['year'].unique()) + yr_range_mdgs = f"{min(mdgs_yrs)}-{max(mdgs_yrs)}" if mdgs_yrs else "-" + yr_range_sdgs = f"{min(sdgs_yrs)}-{max(sdgs_yrs)}" if sdgs_yrs else "-" + note = f"MDGs:{yr_range_mdgs} | SDGs:{yr_range_sdgs}" else: note = "MDGs always" self.logger.info( - f" {int(ind_id):<5} {ind_name[:50]:<52} {data_from:<12} " - f"{mdgs_rows:<12} {sdgs_rows:<12} {note}" + f" {int(ind_id):<5} {ind_name[:50]:<52} {data_from:<11} " + f"{mdgs_rows:<11} {sdgs_rows:<11} {note}" ) fw_summary = self.df_clean['framework'].value_counts() @@ -978,12 +1002,13 @@ class AnalyticalLayerLoader: 'end_year' : self.end_year, 'baseline_year' : self.baseline_year, 'sdg_transition_year' : self.sdg_transition_year, + 'sdg_transition_source' : 'HARDCODE — SDGs resmi berlaku 1 Jan 2016', 'fixed_countries' : len(self.selected_country_ids), 'norm_scale' : '1-100 per indicator global minmax direction-aware', 'framework_logic' : ( - 'sdg_transition_year = min(actual_start_year) dari SDG-only indicators; ' - 'SDG-only year >= sdg_transition_year → SDGs; ' - 'SDG-only year < sdg_transition_year → MDGs (data tetap ada); ' + f'SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE); ' + 'SDG-only + year >= SDG_TRANSITION_YEAR → SDGs; ' + 'SDG-only + year < SDG_TRANSITION_YEAR → MDGs (data tetap ada); ' 'non-SDG-only → MDGs selalu' ), 'sdg_only_keywords_count': len(SDG_ONLY_KEYWORDS), @@ -1022,8 +1047,8 @@ class 
AnalyticalLayerLoader: self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)") self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") self.logger.info( - "Framework: SDG-only indicators → SDGs mulai sdg_transition_year, " - "MDGs sebelumnya (data tetap ada). Non-SDG-only → MDGs selalu." + f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). " + "SDG-only + year >= 2016 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu." ) self.logger.info("=" * 80) @@ -1032,7 +1057,7 @@ class AnalyticalLayerLoader: self.filter_complete_indicators_per_country() self.select_countries_with_all_pillars() self.filter_indicators_consistent_across_fixed_countries() - self.determine_sdg_start_year() + self.assign_framework() self.verify_no_gaps() self.calculate_norm_value() self.calculate_yoy() @@ -1047,7 +1072,7 @@ class AnalyticalLayerLoader: self.logger.info("=" * 80) self.logger.info(f" Duration : {duration:.2f}s") self.logger.info(f" Year Range : {self.start_year}-{self.end_year}") - self.logger.info(f" SDG Transition Year: {self.sdg_transition_year}") + self.logger.info(f" SDG Transition Year: {self.sdg_transition_year} (HARDCODE)") self.logger.info(f" Countries : {len(self.selected_country_ids)}") self.logger.info(f" Indicators : {self.df_clean['indicator_id'].nunique()}") self.logger.info(f" Rows Loaded : {self.pipeline_metadata['rows_loaded']:,}") @@ -1076,8 +1101,8 @@ if __name__ == "__main__": print(f"Norm: min-max 1-100 per indicator, direction-aware") print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}") print( - "Framework: SDG-only → SDGs mulai sdg_transition_year, MDGs sebelumnya. " - "Non-SDG-only → MDGs selalu." + f"Framework: SDG_TRANSITION_YEAR={SDG_TRANSITION_YEAR} (HARDCODE). " + "SDG-only + year >= 2016 → SDGs; sebelumnya MDGs. Non-SDG-only → MDGs selalu." 
) print("=" * 80) @@ -1088,6 +1113,6 @@ if __name__ == "__main__": print("\n" + "=" * 80) print("[OK] COMPLETED") - print(f" SDG Transition Year : {loader.sdg_transition_year}") + print(f" SDG Transition Year : {loader.sdg_transition_year} (HARDCODE)") print(f" Rows Loaded : {loader.pipeline_metadata['rows_loaded']:,}") print("=" * 80) \ No newline at end of file