SDGS MDGS indicator

This commit is contained in:
Debby
2026-03-28 19:15:24 +07:00
parent 0ffdf40430
commit dc981aacab
4 changed files with 812 additions and 329 deletions

View File

@@ -40,12 +40,12 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
"""Load data dari staging_integrated (STAGING/Silver layer)."""
print("\nLoading data from staging_integrated (fs_asean_silver)...")
df_staging = read_from_bigquery(client, 'staging_integrated', layer='silver')
print(f" Loaded : {len(df_staging):,} rows")
print(f" Columns : {len(df_staging.columns)}")
print(f" Sources : {df_staging['source'].nunique()}")
print(f" Indicators : {df_staging['indicator_standardized'].nunique()}")
print(f" Countries : {df_staging['country'].nunique()}")
print(f" Year range : {int(df_staging['year'].min())}-{int(df_staging['year'].max())}")
print(f" Loaded : {len(df_staging):,} rows")
print(f" Columns : {len(df_staging.columns)}")
print(f" Sources : {df_staging['source'].nunique()}")
print(f" Indicators : {df_staging['indicator_standardized'].nunique()}")
print(f" Countries : {df_staging['country'].nunique()}")
print(f" Year range : {int(df_staging['year'].min())}-{int(df_staging['year'].max())}")
return df_staging
@@ -53,7 +53,6 @@ def load_staging_data(client: bigquery.Client) -> pd.DataFrame:
# COLUMN CONSTRAINT HELPERS
# =============================================================================
# Schema constraints — semua varchar max lengths
COLUMN_CONSTRAINTS = {
'source' : 20,
'indicator_original' : 255,
@@ -62,7 +61,8 @@ COLUMN_CONSTRAINTS = {
'year_range' : 20,
'unit' : 20,
'pillar' : 20,
'direction' : 15, # 'higher_better'=13, 'lower_better'=12
'direction' : 15,
'framework' : 5, # 'MDGs'=4, 'SDGs'=4
}
@@ -101,11 +101,11 @@ def apply_column_constraints(df: pd.DataFrame) -> pd.DataFrame:
)
if truncation_report:
print("\n Column Truncations Applied:")
print("\n Column Truncations Applied:")
for column, info in truncation_report.items():
print(f" - {column}: {info['count']} values truncated to {info['max_length']} chars")
else:
print("\n No truncations needed — all values within constraints")
print("\n No truncations needed — all values within constraints")
return df_constrained
@@ -156,11 +156,11 @@ def standardize_country_names_asean(df: pd.DataFrame, country_column: str = 'cou
def map_country(country):
if pd.isna(country):
return country
s = str(country).strip()
s = str(country).strip()
mapped = ASEAN_MAPPING.get(s.upper(), s)
return mapped[:100] if len(mapped) > 100 else mapped
original = df_clean[country_column].copy()
original = df_clean[country_column].copy()
df_clean[country_column] = df_clean[country_column].apply(map_country)
changes = {orig: new for orig, new in zip(original, df_clean[country_column]) if orig != new}
@@ -178,7 +178,7 @@ def assign_pillar(indicator_name: str) -> str:
"""
Assign pillar berdasarkan keyword indikator.
Return values: 'Availability', 'Access', 'Utilization', 'Stability', 'Other'
All 20 chars (varchar(20) constraint).
All <= 20 chars (varchar(20) constraint).
"""
if pd.isna(indicator_name):
return 'Other'
@@ -210,8 +210,9 @@ def assign_pillar(indicator_name: str) -> str:
if any(kw in ind for kw in [
'wasting', 'wasted', 'stunted', 'overweight', 'obese', 'obesity',
'anemia', 'birthweight', 'breastfeeding', 'drinking water', 'sanitation',
'children under 5', 'newborns with low', 'women of reproductive'
'anemia', 'anaemia', 'birthweight', 'breastfeeding', 'drinking water',
'sanitation', 'children under 5', 'newborns with low',
'women of reproductive'
]):
return 'Utilization'
@@ -226,17 +227,15 @@ def assign_direction(indicator_name: str) -> str:
"""
Assign direction berdasarkan indikator.
Return values: 'higher_better' (13 chars) atau 'lower_better' (12 chars)
Both 15 chars (varchar(15) constraint).
Both <= 15 chars (varchar(15) constraint).
"""
if pd.isna(indicator_name):
return 'higher_better'
ind = str(indicator_name).lower()
# Spesifik lower_better
if 'share of dietary energy supply derived from cereals' in ind:
return 'lower_better'
# Higher_better exceptions — cek sebelum lower_better keywords
for kw in [
'exclusive breastfeeding',
'dietary energy supply',
@@ -248,7 +247,6 @@ def assign_direction(indicator_name: str) -> str:
if kw in ind:
return 'higher_better'
# Lower_better — masalah yang harus diminimalkan
for kw in [
'prevalence of undernourishment',
'prevalence of severe food insecurity',
@@ -259,6 +257,7 @@ def assign_direction(indicator_name: str) -> str:
'prevalence of overweight',
'prevalence of obesity',
'prevalence of anemia',
'prevalence of anaemia',
'prevalence of low birthweight',
'number of people undernourished',
'number of severely food insecure',
@@ -283,6 +282,9 @@ def assign_direction(indicator_name: str) -> str:
'coefficient of variation',
'incidence of caloric losses',
'food losses',
'indicator of food price anomalies',
'proportion of local breeds classified as being at risk',
'agricultural export subsidies',
]:
if kw in ind:
return 'lower_better'
@@ -290,6 +292,73 @@ def assign_direction(indicator_name: str) -> str:
return 'higher_better'
# =============================================================================
# FRAMEWORK CLASSIFICATION (MDGs vs SDGs)
# =============================================================================
# Daftar keyword eksplisit dari SDG Goal 2 (2030 Agenda for Sustainable Development)
# Sumber: UN SDG Indicators — versi Maret 2020
# Indikator: 2.1.1, 2.1.2, 2.2.1, 2.2.2, 2.2.3, 2.3.1, 2.3.2, 2.4.1,
# 2.5.1, 2.5.2, 2.a.1, 2.a.2, 2.b.1, 2.c.1
SDG_INDICATOR_KEYWORDS = frozenset([
# 2.1.1 — Prevalence of undernourishment
"prevalence of undernourishment",
# 2.1.2 — Prevalence of moderate or severe food insecurity (FIES)
"prevalence of moderate or severe food insecurity",
"prevalence of severe food insecurity",
"prevalence of moderate food insecurity",
# 2.2.1 — Prevalence of stunting
"prevalence of stunting",
# 2.2.2 — Prevalence of malnutrition (wasting and overweight)
"prevalence of malnutrition",
"prevalence of wasting",
"prevalence of overweight",
# 2.2.3 — Prevalence of anaemia in women 15-49
"prevalence of anaemia",
"prevalence of anemia",
# 2.3.1 — Volume of production per labour unit
"volume of production per labour unit",
# 2.3.2 — Average income of small-scale food producers
"average income of small-scale food producers",
# 2.4.1 — Proportion of agricultural area under productive and sustainable agriculture
"proportion of agricultural area under productive",
# 2.5.1 — Number of plant and animal genetic resources secured
"number of plant and animal genetic resources",
# 2.5.2 — Proportion of local breeds at risk of extinction
"proportion of local breeds classified as being at risk",
# 2.a.1 — Agriculture orientation index for government expenditures
"agriculture orientation index",
# 2.a.2 — Total official flows to the agriculture sector
"total official flows",
# 2.b.1 — Agricultural export subsidies
"agricultural export subsidies",
# 2.c.1 — Indicator of food price anomalies
"indicator of food price anomalies",
])
def assign_framework(indicator_name: str) -> str:
"""
Assign framework berdasarkan daftar eksplisit indikator SDG Goal 2
dari 2030 Agenda for Sustainable Development (versi Maret 2020).
Logika:
- Cek apakah nama indikator mengandung keyword SDG yang terdaftar
- Jika ya -> 'SDGs'
- Jika tidak -> 'MDGs' (indikator FAO/lama yang bukan SDG resmi)
Return values: 'MDGs' atau 'SDGs'
Panjang max 4 chars (dalam constraint varchar(5)).
"""
if pd.isna(indicator_name):
return 'MDGs'
ind = str(indicator_name).lower().strip()
for kw in SDG_INDICATOR_KEYWORDS:
if kw in ind:
return 'SDGs'
return 'MDGs'
# =============================================================================
# CLEANED DATA LOADER
# =============================================================================
@@ -299,19 +368,18 @@ class CleanedDataLoader:
Loader untuk cleaned integrated data ke STAGING layer (Silver).
Kimball context:
Input : staging_integrated STAGING (Silver) — fs_asean_silver
Output : cleaned_integrated STAGING (Silver) — fs_asean_silver
Audit : etl_logs, etl_metadata AUDIT — fs_asean_audit
Input : staging_integrated -> STAGING (Silver) — fs_asean_silver
Output : cleaned_integrated -> STAGING (Silver) — fs_asean_silver
Audit : etl_logs, etl_metadata -> AUDIT — fs_asean_audit
Pipeline steps:
1. Standardize country names (ASEAN)
2. Remove missing values
3. Remove duplicates
4. Add pillar classification
5. Add direction classification
6. Apply column constraints
7. Load ke BigQuery
8. Log ke Audit layer
4. Add pillar, direction & framework classification
5. Apply column constraints
6. Load ke BigQuery
7. Log ke Audit layer
"""
SCHEMA = [
@@ -325,6 +393,7 @@ class CleanedDataLoader:
bigquery.SchemaField("unit", "STRING", mode="NULLABLE"),
bigquery.SchemaField("pillar", "STRING", mode="REQUIRED"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
]
def __init__(self, client: bigquery.Client, load_mode: str = 'full_refresh'):
@@ -355,7 +424,7 @@ class CleanedDataLoader:
def _step_standardize_countries(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 1/5] Standardize country names...")
df, report = standardize_country_names_asean(df, country_column='country')
print(f" ASEAN countries mapped : {report['countries_mapped']}")
print(f" ASEAN countries mapped : {report['countries_mapped']}")
unique_countries = sorted(df['country'].unique())
print(f" Countries ({len(unique_countries)}) : {', '.join(unique_countries)}")
log_update(self.client, 'STAGING', 'staging_integrated',
@@ -377,7 +446,9 @@ class CleanedDataLoader:
def _step_remove_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 3/5] Remove duplicates...")
exact_dups = df.duplicated().sum()
data_dups = df.duplicated(subset=['indicator_standardized', 'country', 'year', 'value']).sum()
data_dups = df.duplicated(
subset=['indicator_standardized', 'country', 'year', 'value']
).sum()
print(f" Exact duplicates : {exact_dups:,}")
print(f" Data duplicates : {data_dups:,}")
rows_before = len(df)
@@ -389,21 +460,39 @@ class CleanedDataLoader:
return df_clean
def _step_add_classifications(self, df: pd.DataFrame) -> pd.DataFrame:
print("\n [Step 4/5] Add pillar & direction classification...")
print("\n [Step 4/5] Add pillar, direction & framework classification...")
df = df.copy()
df['pillar'] = df['indicator_standardized'].apply(assign_pillar)
df['direction'] = df['indicator_standardized'].apply(assign_direction)
df['framework'] = df['indicator_standardized'].apply(assign_framework)
pillar_counts = df['pillar'].value_counts()
print(f" Pillar distribution:")
print(f" Pillar distribution:")
for pillar, count in pillar_counts.items():
print(f" - {pillar}: {count:,}")
direction_counts = df['direction'].value_counts()
print(f" Direction distribution:")
print(f" Direction distribution:")
for direction, count in direction_counts.items():
pct = count / len(df) * 100
print(f" - {direction}: {count:,} ({pct:.1f}%)")
framework_counts = df['framework'].value_counts()
print(f" Framework distribution:")
for fw, count in framework_counts.items():
pct = count / len(df) * 100
print(f" - {fw}: {count:,} ({pct:.1f}%)")
# Log indikator yang terklasifikasi SDGs untuk verifikasi
sdg_inds = (
df[df['framework'] == 'SDGs']['indicator_standardized']
.drop_duplicates().sort_values().tolist()
)
print(f"\n SDG indicators ({len(sdg_inds)}):")
for ind in sdg_inds:
print(f" - {ind}")
return df
def _step_apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -428,7 +517,7 @@ class CleanedDataLoader:
'max' : int(df['year'].max()) if not df['year'].isnull().all() else None,
'unique_years': int(df['year'].nunique())
}
for col in ('pillar', 'direction', 'source'):
for col in ('pillar', 'direction', 'framework', 'source'):
if col in df.columns:
validation[f'{col}_breakdown'] = {
str(k): int(v) for k, v in df[col].value_counts().to_dict().items()
@@ -438,7 +527,6 @@ class CleanedDataLoader:
if 'country' in df.columns:
validation['unique_countries'] = int(df['country'].nunique())
# Column length check
column_length_check = {}
for col, max_len in COLUMN_CONSTRAINTS.items():
if col in df.columns:
@@ -457,7 +545,7 @@ class CleanedDataLoader:
def run(self, df: pd.DataFrame) -> int:
"""
Execute full cleaning pipeline load ke STAGING (Silver).
Execute full cleaning pipeline -> load ke STAGING (Silver).
Returns:
int: Rows loaded
@@ -469,7 +557,6 @@ class CleanedDataLoader:
print(" ERROR: DataFrame is empty, nothing to process.")
return 0
# Pipeline steps
df = self._step_standardize_countries(df)
df = self._step_remove_missing(df)
df = self._step_remove_duplicates(df)
@@ -478,7 +565,6 @@ class CleanedDataLoader:
self.metadata['rows_transformed'] = len(df)
# Validate
validation = self.validate_data(df)
self.metadata['validation_metrics'] = validation
@@ -487,13 +573,12 @@ class CleanedDataLoader:
for info in validation.get('column_length_check', {}).values()
)
if not all_within_limits:
print("\n WARNING: Some columns still exceed length constraints!")
print("\n WARNING: Some columns still exceed length constraints!")
for col, info in validation['column_length_check'].items():
if not info['within_limit']:
print(f" - {col}: {info['max_actual_length']} > {info['max_length_constraint']}")
# Load ke Silver
print(f"\n Loading to [STAGING/Silver] {self.table_name} → fs_asean_silver...")
print(f"\n Loading to [STAGING/Silver] {self.table_name} -> fs_asean_silver...")
rows_loaded = load_to_bigquery(
self.client, df, self.table_name,
layer='silver',
@@ -502,10 +587,8 @@ class CleanedDataLoader:
)
self.metadata['rows_loaded'] = rows_loaded
# Audit logs
log_update(self.client, 'STAGING', self.table_name, 'full_refresh', rows_loaded)
# ETL metadata
self.metadata['end_time'] = datetime.now()
self.metadata['duration_seconds'] = (
self.metadata['end_time'] - self.metadata['start_time']
@@ -516,33 +599,31 @@ class CleanedDataLoader:
self.metadata['validation_metrics'] = json.dumps(validation)
save_etl_metadata(self.client, self.metadata)
# Summary
print(f"\n ✓ Cleaned Integration completed: {rows_loaded:,} rows")
print(f"\n Cleaned Integration completed: {rows_loaded:,} rows")
print(f" Duration : {self.metadata['duration_seconds']:.2f}s")
print(f" Completeness : {validation['completeness_pct']:.2f}%")
if 'year_range' in validation:
yr = validation['year_range']
if yr['min'] and yr['max']:
print(f" Year range : {yr['min']}{yr['max']}")
print(f" Year range : {yr['min']}-{yr['max']}")
print(f" Indicators : {validation.get('unique_indicators', '-')}")
print(f" Countries : {validation.get('unique_countries', '-')}")
print(f"\n Schema Validation:")
for col, info in validation.get('column_length_check', {}).items():
status = "" if info['within_limit'] else ""
print(f" {status} {col}: {info['max_actual_length']}/{info['max_length_constraint']}")
print(f"\n Metadata [AUDIT] etl_metadata")
status = "OK" if info['within_limit'] else "FAIL"
print(f" [{status}] {col}: {info['max_actual_length']}/{info['max_length_constraint']}")
print(f"\n Metadata -> [AUDIT] etl_metadata")
return rows_loaded
# =============================================================================
# AIRFLOW TASK FUNCTIONS ← sama polanya dengan raw layer
# AIRFLOW TASK FUNCTIONS
# =============================================================================
def run_cleaned_integration():
"""
Airflow task: Load cleaned_integrated dari staging_integrated.
Dipanggil oleh DAG setelah task staging_integration_to_silver selesai.
"""
from scripts.bigquery_config import get_bigquery_client
@@ -561,21 +642,21 @@ if __name__ == "__main__":
print("=" * 60)
print("BIGQUERY CLEANED LAYER ETL")
print("Kimball DW Architecture")
print(" Input : STAGING (Silver) staging_integrated")
print(" Output : STAGING (Silver) cleaned_integrated")
print(" Audit : AUDIT etl_logs, etl_metadata")
print(" Input : STAGING (Silver) -> staging_integrated")
print(" Output : STAGING (Silver) -> cleaned_integrated")
print(" Audit : AUDIT -> etl_logs, etl_metadata")
print("=" * 60)
logger = setup_logging()
client = get_bigquery_client()
df_staging = load_staging_data(client)
print("\n[1/1] Cleaned Integration STAGING (Silver)...")
print("\n[1/1] Cleaned Integration -> STAGING (Silver)...")
loader = CleanedDataLoader(client, load_mode='full_refresh')
final_count = loader.run(df_staging)
print("\n" + "=" * 60)
print(" CLEANED LAYER ETL COMPLETED")
print(f" 🥈 STAGING (Silver) : cleaned_integrated ({final_count:,} rows)")
print(f" 📋 AUDIT : etl_logs, etl_metadata")
print("[OK] CLEANED LAYER ETL COMPLETED")
print(f" STAGING (Silver) : cleaned_integrated ({final_count:,} rows)")
print(f" AUDIT : etl_logs, etl_metadata")
print("=" * 60)