move sdgs to analytical_layer

This commit is contained in:
Debby
2026-03-31 13:54:20 +07:00
parent 82ce018913
commit ddf15ca9a5
3 changed files with 321 additions and 237 deletions

View File

@@ -62,7 +62,6 @@ COLUMN_CONSTRAINTS = {
'unit' : 20,
'pillar' : 20,
'direction' : 15,
'framework' : 5, # 'MDGs'=4, 'SDGs'=4
}
@@ -292,62 +291,6 @@ def assign_direction(indicator_name: str) -> str:
return 'higher_better'
# =============================================================================
# FRAMEWORK CLASSIFICATION (MDGs vs SDGs)
# =============================================================================
# Explicit list of the SDG Goal 2 FIES (food-insecurity) indicator names from
# the 2030 Agenda for Sustainable Development.
# Stored lowercase and single-spaced so that matching is insensitive to the
# capitalisation and spacing of the input.
SDG_INDICATOR_KEYWORDS = frozenset([
    "prevalence of severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the female adult population (percent) (3-year average)",
    "number of severely food insecure people (million) (3-year average)",
    "number of severely food insecure male adults (million) (3-year average)",
    "number of severely food insecure female adults (million) (3-year average)",
    "number of moderately or severely food insecure people (million) (3-year average)",
    "number of moderately or severely food insecure male adults (million) (3-year average)",
    "number of moderately or severely food insecure female adults (million) (3-year average)",
])


def assign_framework(indicator_name: str) -> str:
    """
    Classify an indicator as 'SDGs' or 'MDGs'.

    The decision is based on the explicit list of SDG Goal 2 indicators in
    SDG_INDICATOR_KEYWORDS (2030 Agenda for Sustainable Development,
    March 2020 version).

    Logic:
        - Missing values (None / NaN / pd.NA) -> 'MDGs'.
        - The indicator name is lowercased and every run of whitespace
          (including leading/trailing whitespace, tabs and non-breaking
          spaces) is collapsed to a single space.
        - If any SDG keyword is contained in the normalised name -> 'SDGs'.
        - Otherwise -> 'MDGs' (older FAO indicator, not an official SDG one).

    Note on matching: the keywords are stored lowercase and the input is
    lowercased before the `kw in ind` substring test. Keeping mixed-case
    keywords in the set would make every comparison fail.

    Args:
        indicator_name: Indicator name to classify; may be missing.

    Returns:
        'MDGs' or 'SDGs' (at most 4 characters, within the varchar(5)
        column constraint).
    """
    if pd.isna(indicator_name):
        return 'MDGs'

    # Lowercase and collapse whitespace so that names differing only in
    # capitalisation or stray/double spacing still match the keyword list.
    ind = ' '.join(str(indicator_name).lower().split())

    # Substring test: a keyword embedded in a longer name still counts.
    if any(kw in ind for kw in SDG_INDICATOR_KEYWORDS):
        return 'SDGs'

    return 'MDGs'
# =============================================================================
# CLEANED DATA LOADER
# =============================================================================
@@ -365,7 +308,7 @@ class CleanedDataLoader:
1. Standardize country names (ASEAN)
2. Remove missing values
3. Remove duplicates
4. Add pillar, direction & framework classification
4. Add pillar & direction classification
5. Apply column constraints
6. Load ke BigQuery
7. Log ke Audit layer
@@ -382,7 +325,6 @@ class CleanedDataLoader:
bigquery.SchemaField("unit", "STRING", mode="NULLABLE"),
bigquery.SchemaField("pillar", "STRING", mode="REQUIRED"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
]
def __init__(self, client: bigquery.Client, load_mode: str = 'full_refresh'):
@@ -449,12 +391,11 @@ class CleanedDataLoader:
return df_clean
def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 4/5] Add pillar, direction & framework classification...")
print("\n [Step 4/5] Add pillar & direction classification...")
df = df.copy()
df['pillar'] = df['indicator_standardized'].apply(assign_pillar)
df['direction'] = df['indicator_standardized'].apply(assign_direction)
df['framework'] = df['indicator_standardized'].apply(assign_framework)
pillar_counts = df['pillar'].value_counts()
print(f" Pillar distribution:")
@@ -467,21 +408,6 @@ class CleanedDataLoader:
pct = count / len(df) * 100
print(f" - {direction}: {count:,} ({pct:.1f}%)")
framework_counts = df['framework'].value_counts()
print(f" Framework distribution:")
for fw, count in framework_counts.items():
pct = count / len(df) * 100
print(f" - {fw}: {count:,} ({pct:.1f}%)")
# Log indikator yang terklasifikasi SDGs untuk verifikasi
sdg_inds = (
df[df['framework'] == 'SDGs']['indicator_standardized']
.drop_duplicates().sort_values().tolist()
)
print(f"\n SDG indicators ({len(sdg_inds)}):")
for ind in sdg_inds:
print(f" - {ind}")
return df
def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -506,7 +432,7 @@ class CleanedDataLoader:
'max' : int(df['year'].max()) if not df['year'].isnull().all() else None,
'unique_years': int(df['year'].nunique())
}
for col in ('pillar', 'direction', 'framework', 'source'):
for col in ('pillar', 'direction', 'source'):
if col in df.columns:
validation[f'{col}_breakdown'] = {
str(k): int(v) for k, v in df[col].value_counts().to_dict().items()