move sdgs to analytical_layer

2026-03-31 13:54:20 +07:00
parent 82ce018913
commit ddf15ca9a5
3 changed files with 321 additions and 237 deletions
@@ -62,7 +62,6 @@ COLUMN_CONSTRAINTS = {
    'unit'                  : 20,
    'pillar'                : 20,
    'direction'             : 15,
-    'framework'             : 5,   # 'MDGs'=4, 'SDGs'=4
 }


@@ -292,62 +291,6 @@ def assign_direction(indicator_name: str) -> str:
    return 'higher_better'


-# =============================================================================
-# FRAMEWORK CLASSIFICATION (MDGs vs SDGs)
-# =============================================================================
-
-# Daftar keyword eksplisit dari SDG Goal 2 Khusus FIES (2030 Agenda for Sustainable Development).
-# Disimpan lowercase agar matching tidak sensitif terhadap kapitalisasi input.
-
-SDG_INDICATOR_KEYWORDS = frozenset([
-    "prevalence of severe food insecurity in the total population (percent) (3-year average)",
-    "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
-    "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
-    "prevalence of moderate or severe food insecurity in the total population (percent) (3-year average)",
-    "prevalence of moderate or severe food insecurity in the male adult population (percent) (3-year average)",
-    "prevalence of moderate or severe food insecurity in the female adult population (percent) (3-year average)",
-    "number of severely food insecure people (million) (3-year average)",
-    "number of severely food insecure male adults (million) (3-year average)",
-    "number of severely food insecure female adults (million) (3-year average)",
-    "number of moderately or severely food insecure people (million) (3-year average)",
-    "number of moderately or severely food insecure male adults (million) (3-year average)",
-    "number of moderately or severely food insecure female adults (million) (3-year average)",
-])
-
-
-def assign_framework(indicator_name: str) -> str:
-    """
-    Assign framework berdasarkan daftar eksplisit indikator SDG Goal 2
-    dari 2030 Agenda for Sustainable Development (versi Maret 2020).
-
-    Logika:
-    - Lowercase nama indikator input
-    - Cek apakah ada keyword SDG (lowercase) yang terkandung di dalam nama indikator
-    - Jika ya  -> 'SDGs'
-    - Jika tidak -> 'MDGs' (indikator FAO/lama yang bukan SDG resmi)
-
-    FIX: The substring test `kw in ind` (is the keyword contained in `ind`?)
-    was always logically correct. The earlier bug was that the keywords were
-    stored with mixed capitalization while `ind` had already been lowercased,
-    so the comparison always failed. Solution: store the keywords in lowercase
-    in the set, so that `kw in ind` works correctly.
-
-    Return values: 'MDGs' atau 'SDGs'
-    Panjang max 4 chars (dalam constraint varchar(5)).
-    """
-    if pd.isna(indicator_name):
-        return 'MDGs'
-
-    # Lowercase input agar matching tidak sensitif terhadap kapitalisasi
-    ind = str(indicator_name).lower().strip()
-
-    # Cek apakah salah satu keyword SDG (sudah lowercase) ada di dalam ind
-    if any(kw in ind for kw in SDG_INDICATOR_KEYWORDS):
-        return 'SDGs'
-
-    return 'MDGs'
-
-
 # =============================================================================
 # CLEANED DATA LOADER
 # =============================================================================
@@ -365,7 +308,7 @@ class CleanedDataLoader:
        1. Standardize country names (ASEAN)
        2. Remove missing values
        3. Remove duplicates
-        4. Add pillar, direction & framework classification
+        4. Add pillar & direction classification
        5. Apply column constraints
        6. Load ke BigQuery
        7. Log ke Audit layer
@@ -382,7 +325,6 @@ class CleanedDataLoader:
        bigquery.SchemaField("unit",                   "STRING",  mode="NULLABLE"),
        bigquery.SchemaField("pillar",                 "STRING",  mode="REQUIRED"),
        bigquery.SchemaField("direction",              "STRING",  mode="REQUIRED"),
-        bigquery.SchemaField("framework",              "STRING",  mode="REQUIRED"),
    ]

    def __init__(self, client: bigquery.Client, load_mode: str = 'full_refresh'):
@@ -449,12 +391,11 @@ class CleanedDataLoader:
        return df_clean

    def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
-        print("\n  [Step 4/5] Add pillar, direction & framework classification...")
+        print("\n  [Step 4/5] Add pillar & direction classification...")
        df = df.copy()

        df['pillar']    = df['indicator_standardized'].apply(assign_pillar)
        df['direction'] = df['indicator_standardized'].apply(assign_direction)
-        df['framework'] = df['indicator_standardized'].apply(assign_framework)

        pillar_counts = df['pillar'].value_counts()
        print(f"    Pillar distribution:")
@@ -467,21 +408,6 @@ class CleanedDataLoader:
            pct = count / len(df) * 100
            print(f"      - {direction}: {count:,} ({pct:.1f}%)")

-        framework_counts = df['framework'].value_counts()
-        print(f"    Framework distribution:")
-        for fw, count in framework_counts.items():
-            pct = count / len(df) * 100
-            print(f"      - {fw}: {count:,} ({pct:.1f}%)")
-
-        # Log indikator yang terklasifikasi SDGs untuk verifikasi
-        sdg_inds = (
-            df[df['framework'] == 'SDGs']['indicator_standardized']
-            .drop_duplicates().sort_values().tolist()
-        )
-        print(f"\n    SDG indicators ({len(sdg_inds)}):")
-        for ind in sdg_inds:
-            print(f"      - {ind}")
-
        return df

    def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -506,7 +432,7 @@ class CleanedDataLoader:
                'max'         : int(df['year'].max()) if not df['year'].isnull().all() else None,
                'unique_years': int(df['year'].nunique())
            }
-        for col in ('pillar', 'direction', 'framework', 'source'):
+        for col in ('pillar', 'direction', 'source'):
            if col in df.columns:
                validation[f'{col}_breakdown'] = {
                    str(k): int(v) for k, v in df[col].value_counts().to_dict().items()