move sdgs to analytical_layer
This commit is contained in:
@@ -62,7 +62,6 @@ COLUMN_CONSTRAINTS = {
|
||||
'unit' : 20,
|
||||
'pillar' : 20,
|
||||
'direction' : 15,
|
||||
'framework' : 5, # 'MDGs'=4, 'SDGs'=4
|
||||
}
|
||||
|
||||
|
||||
@@ -292,62 +291,6 @@ def assign_direction(indicator_name: str) -> str:
|
||||
return 'higher_better'
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FRAMEWORK CLASSIFICATION (MDGs vs SDGs)
|
||||
# =============================================================================
|
||||
|
||||
# Explicit list of the FIES-specific SDG Goal 2 indicators
# (2030 Agenda for Sustainable Development).
# Every entry is stored in lowercase so that matching is insensitive to the
# capitalisation of the incoming indicator name.
SDG_INDICATOR_KEYWORDS = frozenset({
    "prevalence of severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of severe food insecurity in the female adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the total population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the male adult population (percent) (3-year average)",
    "prevalence of moderate or severe food insecurity in the female adult population (percent) (3-year average)",
    "number of severely food insecure people (million) (3-year average)",
    "number of severely food insecure male adults (million) (3-year average)",
    "number of severely food insecure female adults (million) (3-year average)",
    "number of moderately or severely food insecure people (million) (3-year average)",
    "number of moderately or severely food insecure male adults (million) (3-year average)",
    "number of moderately or severely food insecure female adults (million) (3-year average)",
})


def assign_framework(indicator_name: str) -> str:
    """
    Classify an indicator as 'SDGs' or 'MDGs'.

    The classification relies on the explicit list of SDG Goal 2 indicators
    from the 2030 Agenda for Sustainable Development (March 2020 version)
    held in ``SDG_INDICATOR_KEYWORDS``.

    Logic:
    - A missing value (``None`` / NaN) is classified as 'MDGs'.
    - The indicator name is lowercased, stripped, and every run of
      whitespace is collapsed to a single space.
    - If any SDG keyword is contained in the normalised name -> 'SDGs'.
    - Otherwise -> 'MDGs' (older FAO indicators that are not official SDG
      indicators).

    Note: the substring test ``kw in ind`` only works because BOTH sides are
    lowercase. The keywords must therefore stay lowercase in
    ``SDG_INDICATOR_KEYWORDS``; mixed-case keywords would never match the
    lowercased input.

    Args:
        indicator_name: Indicator name to classify; may be ``None`` or NaN.

    Returns:
        'MDGs' or 'SDGs' (both are four characters long).
    """
    if pd.isna(indicator_name):
        return 'MDGs'

    # Lowercase for case-insensitive matching; split()/join trims the ends
    # and collapses irregular internal whitespace (double spaces, tabs,
    # non-breaking spaces) that would otherwise defeat the substring test.
    ind = " ".join(str(indicator_name).lower().split())

    # Check whether any (already lowercase) SDG keyword occurs inside ind.
    if any(kw in ind for kw in SDG_INDICATOR_KEYWORDS):
        return 'SDGs'

    return 'MDGs'
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLEANED DATA LOADER
|
||||
# =============================================================================
|
||||
@@ -365,7 +308,7 @@ class CleanedDataLoader:
|
||||
1. Standardize country names (ASEAN)
|
||||
2. Remove missing values
|
||||
3. Remove duplicates
|
||||
4. Add pillar, direction & framework classification
|
||||
4. Add pillar & direction classification
|
||||
5. Apply column constraints
|
||||
6. Load ke BigQuery
|
||||
7. Log ke Audit layer
|
||||
@@ -382,7 +325,6 @@ class CleanedDataLoader:
|
||||
bigquery.SchemaField("unit", "STRING", mode="NULLABLE"),
|
||||
bigquery.SchemaField("pillar", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
|
||||
]
|
||||
|
||||
def __init__(self, client: bigquery.Client, load_mode: str = 'full_refresh'):
|
||||
@@ -449,12 +391,11 @@ class CleanedDataLoader:
|
||||
return df_clean
|
||||
|
||||
def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
print("\n [Step 4/5] Add pillar, direction & framework classification...")
|
||||
print("\n [Step 4/5] Add pillar & direction classification...")
|
||||
df = df.copy()
|
||||
|
||||
df['pillar'] = df['indicator_standardized'].apply(assign_pillar)
|
||||
df['direction'] = df['indicator_standardized'].apply(assign_direction)
|
||||
df['framework'] = df['indicator_standardized'].apply(assign_framework)
|
||||
|
||||
pillar_counts = df['pillar'].value_counts()
|
||||
print(f" Pillar distribution:")
|
||||
@@ -467,21 +408,6 @@ class CleanedDataLoader:
|
||||
pct = count / len(df) * 100
|
||||
print(f" - {direction}: {count:,} ({pct:.1f}%)")
|
||||
|
||||
framework_counts = df['framework'].value_counts()
|
||||
print(f" Framework distribution:")
|
||||
for fw, count in framework_counts.items():
|
||||
pct = count / len(df) * 100
|
||||
print(f" - {fw}: {count:,} ({pct:.1f}%)")
|
||||
|
||||
# Log indikator yang terklasifikasi SDGs untuk verifikasi
|
||||
sdg_inds = (
|
||||
df[df['framework'] == 'SDGs']['indicator_standardized']
|
||||
.drop_duplicates().sort_values().tolist()
|
||||
)
|
||||
print(f"\n SDG indicators ({len(sdg_inds)}):")
|
||||
for ind in sdg_inds:
|
||||
print(f" - {ind}")
|
||||
|
||||
return df
|
||||
|
||||
def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
@@ -506,7 +432,7 @@ class CleanedDataLoader:
|
||||
'max' : int(df['year'].max()) if not df['year'].isnull().all() else None,
|
||||
'unique_years': int(df['year'].nunique())
|
||||
}
|
||||
for col in ('pillar', 'direction', 'framework', 'source'):
|
||||
for col in ('pillar', 'direction', 'source'):
|
||||
if col in df.columns:
|
||||
validation[f'{col}_breakdown'] = {
|
||||
str(k): int(v) for k, v in df[col].value_counts().to_dict().items()
|
||||
|
||||
Reference in New Issue
Block a user