sdgs year v4
This commit is contained in:
@@ -8,7 +8,7 @@ Filtering Order:
|
|||||||
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
|
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
|
||||||
4. Filter countries with ALL pillars (FIXED SET)
|
4. Filter countries with ALL pillars (FIXED SET)
|
||||||
5. Filter indicators with consistent presence across FIXED countries
|
5. Filter indicators with consistent presence across FIXED countries
|
||||||
6. Determine SDG start year & assign framework (MDGs/SDGs) per indicator PER ROW
|
6. Assign framework (MDGs/SDGs) per indicator PER ROW
|
||||||
7. Verify no gaps
|
7. Verify no gaps
|
||||||
8. Calculate norm_value_1_100 per indicator per country (min-max, direction-aware)
|
8. Calculate norm_value_1_100 per indicator per country (min-max, direction-aware)
|
||||||
9. Calculate YoY per indicator per country
|
9. Calculate YoY per indicator per country
|
||||||
@@ -22,16 +22,17 @@ NORMALISASI (Step 8):
|
|||||||
sehingga nilai antar negara dan antar tahun tetap comparable
|
sehingga nilai antar negara dan antar tahun tetap comparable
|
||||||
- Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio
|
- Kolom ini memungkinkan perbandingan antar indikator yang berbeda satuan di Looker Studio
|
||||||
|
|
||||||
FRAMEWORK LOGIC (Row-Level Assignment):
|
FRAMEWORK LOGIC (FIX - Per Indicator, Per Row):
|
||||||
- SDG start year dideteksi dari data: tahun pertama indikator FIES/anaemia lengkap
|
- Framework di-assign PER BARIS dengan mempertimbangkan actual_start_year MASING-MASING
|
||||||
di semua fixed countries (setelah Step 3-5 filter selesai)
|
indikator, bukan satu sdg_start_year global.
|
||||||
- Framework di-assign PER BARIS (per tahun):
|
- Logika:
|
||||||
* year < sdg_start_year → selalu 'MDGs' (semua indikator)
|
* Jika nama indikator TIDAK ada di SDG_ONLY_KEYWORDS → selalu 'MDGs' (semua tahun)
|
||||||
* year >= sdg_start_year + nama di SDG_ONLY_KEYWORDS → 'SDGs'
|
* Jika nama indikator ADA di SDG_ONLY_KEYWORDS:
|
||||||
* selain itu (implisit) → 'MDGs'
|
- row['year'] >= actual_start_year[indicator] → 'SDGs'
|
||||||
- Hanya FIES dan anaemia yang masuk SDG_ONLY_KEYWORDS karena murni baru di era SDGs.
|
- row['year'] < actual_start_year[indicator] → 'MDGs'
|
||||||
- Shared indicators (stunting, wasting, overweight, undernourishment) tidak terdaftar
|
- Baris dengan year < actual_start_year TETAP ADA di data (tidak dihapus di Step 5),
|
||||||
di SDG_ONLY_KEYWORDS sehingga secara implisit selalu berlabel 'MDGs' di semua tahun.
|
hanya mendapat label 'MDGs'.
|
||||||
|
- actual_start_year per indikator = max(min_year per country) setelah Step 3-4 filter; dihitung di Step 5 dan disimpan di indicator_max_start_map (indicator_id → max_start_year)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -61,8 +62,8 @@ from google.cloud import bigquery
|
|||||||
# SDG-ONLY INDICATOR KEYWORDS
|
# SDG-ONLY INDICATOR KEYWORDS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Hanya indikator yang MURNI BARU di era SDGs yang didaftarkan di sini.
|
# Hanya indikator yang MURNI BARU di era SDGs yang didaftarkan di sini.
|
||||||
# Baris dengan year >= sdg_start_year + nama ada di set ini → 'SDGs'.
|
# Indikator di set ini → 'SDGs' mulai dari actual_start_year indikator tersebut.
|
||||||
# Semua indikator lain (shared maupun tidak dikenal) → 'MDGs' secara implisit.
|
# Semua indikator lain (shared maupun tidak dikenal) → 'MDGs' di semua tahun.
|
||||||
|
|
||||||
SDG_ONLY_KEYWORDS = frozenset([
|
SDG_ONLY_KEYWORDS = frozenset([
|
||||||
# TARGET 2.1.2 — FIES (SDGs only)
|
# TARGET 2.1.2 — FIES (SDGs only)
|
||||||
@@ -83,19 +84,9 @@ SDG_ONLY_KEYWORDS = frozenset([
|
|||||||
"number of women of reproductive age (15-49 years) affected by anemia (million)",
|
"number of women of reproductive age (15-49 years) affected by anemia (million)",
|
||||||
])
|
])
|
||||||
|
|
||||||
# Proxy keywords untuk deteksi era SDGs dari data (indikator murni baru di SDGs)
|
|
||||||
_SDG_ERA_PROXY_KEYWORDS = frozenset([
|
|
||||||
"food insecurity",
|
|
||||||
"anemia",
|
|
||||||
"anaemia",
|
|
||||||
])
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# THRESHOLD KONDISI (fixed absolute, skala 1-100)
|
# THRESHOLD KONDISI (fixed absolute, skala 1-100)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# bad : norm_value_1_100 < THRESHOLD_BAD
|
|
||||||
# good : norm_value_1_100 > THRESHOLD_GOOD
|
|
||||||
# moderate : di antara keduanya
|
|
||||||
|
|
||||||
THRESHOLD_BAD = 40.0
|
THRESHOLD_BAD = 40.0
|
||||||
THRESHOLD_GOOD = 60.0
|
THRESHOLD_GOOD = 60.0
|
||||||
@@ -104,8 +95,6 @@ THRESHOLD_GOOD = 60.0
|
|||||||
def assign_condition(norm_value_1_100: float) -> str:
|
def assign_condition(norm_value_1_100: float) -> str:
|
||||||
"""
|
"""
|
||||||
Assign kondisi berdasarkan norm_value_1_100 (skala 1-100, sudah direction-aware).
|
Assign kondisi berdasarkan norm_value_1_100 (skala 1-100, sudah direction-aware).
|
||||||
Nilai tinggi selalu berarti lebih baik (lower_better sudah diinvert).
|
|
||||||
|
|
||||||
Returns: 'good' / 'moderate' / 'bad'
|
Returns: 'good' / 'moderate' / 'bad'
|
||||||
"""
|
"""
|
||||||
if pd.isna(norm_value_1_100):
|
if pd.isna(norm_value_1_100):
|
||||||
@@ -117,38 +106,6 @@ def assign_condition(norm_value_1_100: float) -> str:
|
|||||||
return 'moderate'
|
return 'moderate'
|
||||||
|
|
||||||
|
|
||||||
def assign_framework_for_row(
|
|
||||||
indicator_name: str,
|
|
||||||
row_year: int,
|
|
||||||
sdg_start_year: int,
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Tentukan framework (MDGs/SDGs) PER BARIS (per tahun).
|
|
||||||
|
|
||||||
Logic:
|
|
||||||
─────────────────────────────────────────────────────────────────────────
|
|
||||||
RULE 1: row_year < sdg_start_year
|
|
||||||
→ selalu 'MDGs', tanpa kecuali.
|
|
||||||
|
|
||||||
RULE 2: row_year >= sdg_start_year AND nama ada di SDG_ONLY_KEYWORDS
|
|
||||||
→ 'SDGs'
|
|
||||||
|
|
||||||
RULE 3 (implisit): semua kondisi lain
|
|
||||||
→ 'MDGs'
|
|
||||||
Ini mencakup shared indicators (stunting, wasting, overweight,
|
|
||||||
undernourishment) yang tidak terdaftar di SDG_ONLY_KEYWORDS,
|
|
||||||
sehingga tidak perlu di-list secara eksplisit.
|
|
||||||
─────────────────────────────────────────────────────────────────────────
|
|
||||||
"""
|
|
||||||
if row_year < sdg_start_year:
|
|
||||||
return 'MDGs'
|
|
||||||
|
|
||||||
if str(indicator_name).lower().strip() in SDG_ONLY_KEYWORDS:
|
|
||||||
return 'SDGs'
|
|
||||||
|
|
||||||
return 'MDGs'
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# ANALYTICAL LAYER CLASS
|
# ANALYTICAL LAYER CLASS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -162,13 +119,16 @@ class AnalyticalLayerLoader:
|
|||||||
indicator_id, indicator_name, direction, framework,
|
indicator_id, indicator_name, direction, framework,
|
||||||
pillar_id, pillar_name,
|
pillar_id, pillar_name,
|
||||||
time_id, year, value,
|
time_id, year, value,
|
||||||
norm_value_1_100, <- min-max norm per indikator, skala 1-100, direction-aware
|
norm_value_1_100,
|
||||||
yoy_change, yoy_pct
|
yoy_change, yoy_pct
|
||||||
|
|
||||||
FRAMEWORK LOGIC:
|
FRAMEWORK LOGIC (FIX):
|
||||||
- year < sdg_start_year → 'MDGs' (semua indikator)
|
- Indikator TIDAK di SDG_ONLY_KEYWORDS → 'MDGs' di SEMUA tahun
|
||||||
- year >= sdg_start_year + nama di SDG_ONLY_KEYWORDS → 'SDGs' (FIES + anaemia)
|
- Indikator DI SDG_ONLY_KEYWORDS:
|
||||||
- selain itu (implisit) → 'MDGs'
|
year >= actual_start_year[indikator] → 'SDGs'
|
||||||
|
year < actual_start_year[indikator] → 'MDGs'
|
||||||
|
- actual_start_year per indikator = max(min_year per country) setelah Step 3-4 filter; dihitung di Step 5 dan disimpan di indicator_max_start_map (indicator_id → max_start_year)
|
||||||
|
- Baris year < actual_start_year TETAP ADA, hanya berlabel 'MDGs'
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, client: bigquery.Client):
|
def __init__(self, client: bigquery.Client):
|
||||||
@@ -182,12 +142,13 @@ class AnalyticalLayerLoader:
|
|||||||
self.df_pillar = None
|
self.df_pillar = None
|
||||||
|
|
||||||
self.selected_country_ids = None
|
self.selected_country_ids = None
|
||||||
|
self.indicator_max_start_map = {} # indicator_id → max_start_year (dari Step 5)
|
||||||
|
|
||||||
self.start_year = 2013
|
self.start_year = 2013
|
||||||
self.end_year = None
|
self.end_year = None
|
||||||
self.baseline_year = 2023 # hardcode per syarat dosen (tahun terlengkap)
|
self.baseline_year = 2023
|
||||||
|
|
||||||
self.sdg_start_year = None
|
self.sdg_start_year = None # disimpan untuk metadata/logging saja
|
||||||
|
|
||||||
self.pipeline_metadata = {
|
self.pipeline_metadata = {
|
||||||
'source_class' : self.__class__.__name__,
|
'source_class' : self.__class__.__name__,
|
||||||
@@ -490,19 +451,22 @@ class AnalyticalLayerLoader:
|
|||||||
if not valid_indicators:
|
if not valid_indicators:
|
||||||
raise ValueError("No valid indicators found after filtering!")
|
raise ValueError("No valid indicators found after filtering!")
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# Filter hanya indikator yang valid
|
||||||
|
# TIDAK menghapus baris year < max_start_year —
|
||||||
|
# semua baris tetap ada, label framework ditentukan di Step 6
|
||||||
|
# ----------------------------------------------------------------
|
||||||
original_count = len(self.df_clean)
|
original_count = len(self.df_clean)
|
||||||
self.df_clean = self.df_clean[
|
self.df_clean = self.df_clean[
|
||||||
self.df_clean['indicator_id'].isin(valid_indicators)
|
self.df_clean['indicator_id'].isin(valid_indicators)
|
||||||
].copy()
|
].copy()
|
||||||
|
|
||||||
self.df_clean = self.df_clean.merge(
|
# Simpan max_start_year sebagai lookup untuk Step 6
|
||||||
indicator_max_start[['indicator_id', 'max_start_year']],
|
self.indicator_max_start_map = (
|
||||||
on='indicator_id', how='left'
|
indicator_max_start[indicator_max_start['indicator_id'].isin(valid_indicators)]
|
||||||
|
.set_index('indicator_id')['max_start_year']
|
||||||
|
.to_dict()
|
||||||
)
|
)
|
||||||
self.df_clean = self.df_clean[
|
|
||||||
self.df_clean['year'] >= self.df_clean['max_start_year']
|
|
||||||
].copy()
|
|
||||||
self.df_clean = self.df_clean.drop('max_start_year', axis=1)
|
|
||||||
|
|
||||||
self.logger.info(f"\n Rows before: {original_count:,}")
|
self.logger.info(f"\n Rows before: {original_count:,}")
|
||||||
self.logger.info(f" Rows after: {len(self.df_clean):,}")
|
self.logger.info(f" Rows after: {len(self.df_clean):,}")
|
||||||
@@ -512,74 +476,123 @@ class AnalyticalLayerLoader:
|
|||||||
return self.df_clean
|
return self.df_clean
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK (ROW-LEVEL)
|
# STEP 6: ASSIGN FRAMEWORK PER ROW (per-indicator actual_start_year)
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
def determine_sdg_start_year(self):
|
def determine_sdg_start_year(self):
|
||||||
self.logger.info("\n" + "=" * 80)
|
self.logger.info("\n" + "=" * 80)
|
||||||
self.logger.info("STEP 6: DETERMINE SDG START YEAR & ASSIGN FRAMEWORK (ROW-LEVEL)")
|
self.logger.info("STEP 6: ASSIGN FRAMEWORK PER ROW (per-indicator actual_start_year)")
|
||||||
self.logger.info("=" * 80)
|
self.logger.info("=" * 80)
|
||||||
|
|
||||||
indicator_actual_start = (
|
# ----------------------------------------------------------------
|
||||||
self.df_clean
|
# Hitung actual_start_year PER INDIKATOR dari indicator_max_start_map
|
||||||
.groupby(['indicator_id', 'indicator_name', 'country_id'])['year']
|
# yang sudah dihitung di Step 5.
|
||||||
.min().reset_index()
|
# actual_start_year = max(min_year per country) per indikator
|
||||||
.groupby(['indicator_id', 'indicator_name'])['year']
|
# = tahun di mana semua fixed countries sudah punya data
|
||||||
.max().reset_index()
|
# ----------------------------------------------------------------
|
||||||
)
|
indicator_actual_start = pd.DataFrame([
|
||||||
indicator_actual_start.columns = ['indicator_id', 'indicator_name', 'actual_start_year']
|
{'indicator_id': ind_id, 'actual_start_year': start_yr}
|
||||||
|
for ind_id, start_yr in self.indicator_max_start_map.items()
|
||||||
|
])
|
||||||
|
|
||||||
# Deteksi sdg_start_year dari proxy SDGs-only (FIES & anaemia)
|
# Merge indicator_name untuk keperluan logging
|
||||||
proxy_mask = indicator_actual_start['indicator_name'].str.lower().apply(
|
indicator_actual_start = indicator_actual_start.merge(
|
||||||
lambda n: any(kw in n for kw in _SDG_ERA_PROXY_KEYWORDS)
|
self.df_clean[['indicator_id', 'indicator_name']].drop_duplicates(),
|
||||||
|
on='indicator_id', how='left'
|
||||||
)
|
)
|
||||||
df_proxy = indicator_actual_start[proxy_mask]
|
|
||||||
|
|
||||||
if df_proxy.empty:
|
# Tandai mana yang SDG-only
|
||||||
|
indicator_actual_start['is_sdg_only'] = (
|
||||||
|
indicator_actual_start['indicator_name']
|
||||||
|
.str.lower().str.strip()
|
||||||
|
.isin(SDG_ONLY_KEYWORDS)
|
||||||
|
)
|
||||||
|
|
||||||
|
# sdg_start_year global = min(actual_start_year dari SDG-only indicators)
|
||||||
|
# Disimpan hanya untuk metadata/logging
|
||||||
|
sdg_only_df = indicator_actual_start[indicator_actual_start['is_sdg_only']]
|
||||||
|
if sdg_only_df.empty:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Tidak ada indikator proxy SDGs (FIES/anaemia) yang lolos filter. "
|
"Tidak ada indikator SDG-only (FIES/anaemia) yang lolos filter. "
|
||||||
"Pastikan indikator FIES dan anaemia ada di data."
|
"Pastikan indikator FIES dan anaemia ada di data."
|
||||||
)
|
)
|
||||||
|
self.sdg_start_year = int(sdg_only_df['actual_start_year'].min())
|
||||||
|
|
||||||
self.sdg_start_year = int(df_proxy['actual_start_year'].min())
|
self.logger.info(f"\n SDG-only indicators dan actual_start_year masing-masing:")
|
||||||
self.logger.info(f"\n sdg_start_year = {self.sdg_start_year}")
|
self.logger.info(f" {'-'*80}")
|
||||||
self.logger.info(f" Proxy indicators (penentu sdg_start_year):")
|
for _, row in indicator_actual_start[indicator_actual_start['is_sdg_only']].iterrows():
|
||||||
for _, row in df_proxy.iterrows():
|
|
||||||
self.logger.info(f" [{int(row['actual_start_year'])}] {row['indicator_name']}")
|
|
||||||
|
|
||||||
self.logger.info(f"\n Assigning framework PER ROW...")
|
|
||||||
self.logger.info(f" year < {self.sdg_start_year} → MDGs (semua indikator)")
|
|
||||||
self.logger.info(f" year >= {self.sdg_start_year} + nama in SDG_ONLY_KEYWORDS → SDGs")
|
|
||||||
self.logger.info(f" selain itu (implisit) → MDGs")
|
|
||||||
|
|
||||||
self.df_clean['framework'] = self.df_clean.apply(
|
|
||||||
lambda row: assign_framework_for_row(
|
|
||||||
indicator_name = row['indicator_name'],
|
|
||||||
row_year = int(row['year']),
|
|
||||||
sdg_start_year = self.sdg_start_year,
|
|
||||||
),
|
|
||||||
axis=1
|
|
||||||
)
|
|
||||||
|
|
||||||
# Log ringkasan per indikator untuk verifikasi
|
|
||||||
self.logger.info(f"\n {'Framework Assignment per Indicator':}")
|
|
||||||
self.logger.info(f" {'-'*100}")
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f" {'ID':<5} {'Indicator Name':<52} "
|
f" [SDG-only] start={int(row['actual_start_year'])} | {row['indicator_name']}"
|
||||||
f"{'Pre-SDG':<10} {'MDGs':<10} {'SDGs':<10} {'SDG-Only?'}"
|
|
||||||
)
|
)
|
||||||
self.logger.info(f" {'-'*100}")
|
self.logger.info(
|
||||||
|
f"\n sdg_start_year (earliest SDG-only, for metadata): {self.sdg_start_year}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Lookup: indicator_id → actual_start_year (hanya SDG-only, untuk logging)
|
||||||
|
sdg_only_start_map = (
|
||||||
|
indicator_actual_start[indicator_actual_start['is_sdg_only']]
|
||||||
|
.set_index('indicator_id')['actual_start_year']
|
||||||
|
.to_dict()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.logger.info(f"\n Logika assign framework (PER BARIS, PER INDIKATOR):")
|
||||||
|
self.logger.info(f" ─────────────────────────────────────────────────────")
|
||||||
|
self.logger.info(f" Jika indikator TIDAK di SDG_ONLY_KEYWORDS:")
|
||||||
|
self.logger.info(f" → 'MDGs' di semua tahun (shared indicators)")
|
||||||
|
self.logger.info(f" Jika indikator DI SDG_ONLY_KEYWORDS:")
|
||||||
|
self.logger.info(f" year >= actual_start_year[indikator] → 'SDGs'")
|
||||||
|
self.logger.info(f" year < actual_start_year[indikator] → 'MDGs'")
|
||||||
|
self.logger.info(f" ─────────────────────────────────────────────────────")
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# Assign framework dengan vectorized merge
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
self.df_clean = self.df_clean.merge(
|
||||||
|
indicator_actual_start[['indicator_id', 'is_sdg_only', 'actual_start_year']],
|
||||||
|
on='indicator_id',
|
||||||
|
how='left'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Assign framework:
|
||||||
|
# - Jika bukan SDG-only → 'MDGs'
|
||||||
|
# - Jika SDG-only AND year >= actual_start_year → 'SDGs'
|
||||||
|
# - Jika SDG-only AND year < actual_start_year → 'MDGs'
|
||||||
|
self.df_clean['framework'] = np.where(
|
||||||
|
self.df_clean['is_sdg_only'] & (self.df_clean['year'] >= self.df_clean['actual_start_year']),
|
||||||
|
'SDGs',
|
||||||
|
'MDGs'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Drop kolom bantu
|
||||||
|
self.df_clean = self.df_clean.drop(columns=['is_sdg_only', 'actual_start_year'])
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# Log verifikasi per indikator
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
self.logger.info(f"\n Verifikasi framework per indikator:")
|
||||||
|
self.logger.info(f" {'-'*105}")
|
||||||
|
self.logger.info(
|
||||||
|
f" {'ID':<5} {'Indicator Name':<52} {'Start':<8} "
|
||||||
|
f"{'MDGs rows':<12} {'SDGs rows':<12} {'Expected'}"
|
||||||
|
)
|
||||||
|
self.logger.info(f" {'-'*105}")
|
||||||
|
|
||||||
for ind_id, grp in self.df_clean.groupby('indicator_id'):
|
for ind_id, grp in self.df_clean.groupby('indicator_id'):
|
||||||
ind_name = grp['indicator_name'].iloc[0]
|
ind_name = grp['indicator_name'].iloc[0]
|
||||||
pre_sdg = (grp['year'] < self.sdg_start_year).sum()
|
|
||||||
mdgs_rows = (grp['framework'] == 'MDGs').sum()
|
mdgs_rows = (grp['framework'] == 'MDGs').sum()
|
||||||
sdgs_rows = (grp['framework'] == 'SDGs').sum()
|
sdgs_rows = (grp['framework'] == 'SDGs').sum()
|
||||||
is_sdg_only = ind_name.lower().strip() in SDG_ONLY_KEYWORDS
|
is_sdg_only = ind_name.lower().strip() in SDG_ONLY_KEYWORDS
|
||||||
|
start_yr = int(grp['year'].min())
|
||||||
|
|
||||||
|
if is_sdg_only:
|
||||||
|
ind_start = sdg_only_start_map.get(ind_id, '?')
|
||||||
|
expected = f"SDGs from {ind_start}, MDGs before"
|
||||||
|
else:
|
||||||
|
expected = "MDGs always"
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f" {int(ind_id):<5} {ind_name[:50]:<52} "
|
f" {int(ind_id):<5} {ind_name[:50]:<52} {start_yr:<8} "
|
||||||
f"{pre_sdg:<10} {mdgs_rows:<10} {sdgs_rows:<10} "
|
f"{mdgs_rows:<12} {sdgs_rows:<12} {expected}"
|
||||||
f"{'YES' if is_sdg_only else 'no'}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
fw_summary = self.df_clean['framework'].value_counts()
|
fw_summary = self.df_clean['framework'].value_counts()
|
||||||
@@ -609,23 +622,41 @@ class AnalyticalLayerLoader:
|
|||||||
self.logger.info("STEP 7: VERIFY NO GAPS")
|
self.logger.info("STEP 7: VERIFY NO GAPS")
|
||||||
self.logger.info("=" * 80)
|
self.logger.info("=" * 80)
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------
|
||||||
|
# Verifikasi dilakukan PER INDIKATOR dari actual_start_year-nya,
|
||||||
|
# bukan dari self.start_year global, karena tiap indikator bisa
|
||||||
|
# punya start year berbeda. Baris dengan year < actual_start_year tetap ada di data tetapi tidak diperiksa di sini.
|
||||||
|
# ----------------------------------------------------------------
|
||||||
expected_countries = len(self.selected_country_ids)
|
expected_countries = len(self.selected_country_ids)
|
||||||
verification = self.df_clean.groupby(
|
all_good = True
|
||||||
['indicator_id', 'year']
|
bad_rows = []
|
||||||
)['country_id'].nunique().reset_index()
|
|
||||||
verification.columns = ['indicator_id', 'year', 'country_count']
|
for ind_id, grp in self.df_clean.groupby('indicator_id'):
|
||||||
all_good = (verification['country_count'] == expected_countries).all()
|
actual_start = self.indicator_max_start_map.get(ind_id)
|
||||||
|
if actual_start is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
expected_years = list(range(int(actual_start), self.end_year + 1))
|
||||||
|
|
||||||
|
for year in expected_years:
|
||||||
|
country_count = grp[grp['year'] == year]['country_id'].nunique()
|
||||||
|
if country_count != expected_countries:
|
||||||
|
all_good = False
|
||||||
|
bad_rows.append({
|
||||||
|
'indicator_id' : int(ind_id),
|
||||||
|
'year' : int(year),
|
||||||
|
'country_count': int(country_count),
|
||||||
|
})
|
||||||
|
|
||||||
if all_good:
|
if all_good:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
f" VERIFICATION PASSED — all combinations have {expected_countries} countries"
|
f" VERIFICATION PASSED — all combinations have {expected_countries} countries"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
bad = verification[verification['country_count'] != expected_countries]
|
for row in bad_rows[:10]:
|
||||||
for _, row in bad.head(10).iterrows():
|
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
f" Indicator {int(row['indicator_id'])}, Year {int(row['year'])}: "
|
f" Indicator {row['indicator_id']}, Year {row['year']}: "
|
||||||
f"{int(row['country_count'])} countries (expected {expected_countries})"
|
f"{row['country_count']} countries (expected {expected_countries})"
|
||||||
)
|
)
|
||||||
raise ValueError("Gap verification failed!")
|
raise ValueError("Gap verification failed!")
|
||||||
|
|
||||||
@@ -638,13 +669,7 @@ class AnalyticalLayerLoader:
|
|||||||
def calculate_norm_value(self):
|
def calculate_norm_value(self):
|
||||||
"""
|
"""
|
||||||
Hitung norm_value_1_100 per indikator — min-max normalisasi skala 1-100,
|
Hitung norm_value_1_100 per indikator — min-max normalisasi skala 1-100,
|
||||||
direction-aware.
|
direction-aware, global per indikator (semua negara + semua tahun).
|
||||||
|
|
||||||
CARA KERJA:
|
|
||||||
- Normalisasi dilakukan GLOBAL per indikator (semua negara + semua tahun sekaligus)
|
|
||||||
sehingga nilai antar negara dan antar tahun tetap comparable.
|
|
||||||
- lower_better diinvert: nilai tinggi selalu = kondisi lebih baik.
|
|
||||||
- Skala 1-100 (bukan 0-100) untuk menghindari nilai absolut nol di Looker Studio.
|
|
||||||
"""
|
"""
|
||||||
self.logger.info("\n" + "=" * 80)
|
self.logger.info("\n" + "=" * 80)
|
||||||
self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR")
|
self.logger.info("STEP 8: CALCULATE NORM_VALUE_1_100 PER INDICATOR")
|
||||||
@@ -936,9 +961,9 @@ class AnalyticalLayerLoader:
|
|||||||
'fixed_countries' : len(self.selected_country_ids),
|
'fixed_countries' : len(self.selected_country_ids),
|
||||||
'norm_scale' : '1-100 per indicator global minmax direction-aware',
|
'norm_scale' : '1-100 per indicator global minmax direction-aware',
|
||||||
'framework_logic' : (
|
'framework_logic' : (
|
||||||
'row-level: year < sdg_start_year → MDGs always; '
|
'per-indicator actual_start_year: '
|
||||||
'year >= sdg_start_year + SDG_ONLY_KEYWORDS → SDGs; '
|
'SDG-only indicator → SDGs from its own actual_start_year, MDGs before; '
|
||||||
'else (implicit) → MDGs'
|
'shared/other indicators → MDGs always'
|
||||||
),
|
),
|
||||||
'sdg_only_keywords_count' : len(SDG_ONLY_KEYWORDS),
|
'sdg_only_keywords_count' : len(SDG_ONLY_KEYWORDS),
|
||||||
'condition_thresholds' : {
|
'condition_thresholds' : {
|
||||||
@@ -975,7 +1000,7 @@ class AnalyticalLayerLoader:
|
|||||||
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
||||||
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
|
self.logger.info("Kolom baru: norm_value_1_100 (min-max 1-100, direction-aware)")
|
||||||
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
|
self.logger.info(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
|
||||||
self.logger.info("Framework: year < sdg_start_year → MDGs | SDG_ONLY → SDGs | else → MDGs (implicit)")
|
self.logger.info("Framework: per-indicator actual_start_year (baris year < actual_start_year tetap ada, berlabel MDGs)")
|
||||||
self.logger.info("=" * 80)
|
self.logger.info("=" * 80)
|
||||||
|
|
||||||
self.load_source_data()
|
self.load_source_data()
|
||||||
@@ -1026,7 +1051,7 @@ if __name__ == "__main__":
|
|||||||
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
||||||
print(f"Norm: min-max 1-100 per indicator, direction-aware")
|
print(f"Norm: min-max 1-100 per indicator, direction-aware")
|
||||||
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
|
print(f"Condition threshold: bad<{THRESHOLD_BAD}, good>{THRESHOLD_GOOD}")
|
||||||
print(f"Framework: year < sdg_start_year → MDGs | SDG_ONLY → SDGs | else → MDGs (implicit)")
|
print("Framework: per-indicator actual_start_year (baris year < actual_start_year tetap ada, berlabel MDGs)")
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
|
|
||||||
logger = setup_logging()
|
logger = setup_logging()
|
||||||
|
|||||||
Reference in New Issue
Block a user