SDGS MDGS indicator

This commit is contained in:
Debby
2026-03-28 19:15:24 +07:00
parent 0ffdf40430
commit dc981aacab
4 changed files with 812 additions and 329 deletions

View File

@@ -8,7 +8,15 @@ Filtering Order:
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
4. Filter countries with ALL pillars (FIXED SET)
5. Filter indicators with consistent presence across FIXED countries
6. Save analytical table (dengan nama/label lengkap untuk Looker Studio)
6. Calculate YoY per indicator per country
7. Save analytical table (dengan nama/label lengkap + kolom framework + YoY untuk Looker Studio)
UPDATED:
- Kolom 'framework' (MDGs/SDGs) dipropagasi dari dim_indicator ke tabel output.
Hal ini memungkinkan Looker Studio melakukan filter/slice berdasarkan framework
tanpa perlu join ulang ke dim_indicator.
- Kolom 'yoy_change' dan 'yoy_pct' ditambahkan untuk analisis Year-over-Year
per indikator per negara langsung di Looker Studio.
"""
import pandas as pd
@@ -46,9 +54,17 @@ class AnalyticalLayerLoader:
1. Complete per country (no gaps from start_year to end_year)
2. Filter countries with all pillars
3. Ensure indicators have consistent country count across all years
4. Save dengan kolom lengkap (nama + ID) untuk kemudahan Looker Studio
4. Calculate YoY (year-over-year) change per indicator per country
5. Save dengan kolom lengkap (nama + ID + framework + YoY) untuk Looker Studio
Output: fact_asean_food_security_selected -> DW layer (Gold) -> fs_asean_gold
Kolom output:
country_id, country_name,
indicator_id, indicator_name, direction, framework,
pillar_id, pillar_name,
time_id, year, value,
yoy_change, yoy_pct
"""
def __init__(self, client: bigquery.Client):
@@ -87,6 +103,7 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80)
try:
# Sertakan kolom framework dari dim_indicator dalam query
query = f"""
SELECT
f.country_id,
@@ -94,6 +111,7 @@ class AnalyticalLayerLoader:
f.indicator_id,
i.indicator_name,
i.direction,
i.framework,
f.pillar_id,
p.pillar_name,
f.time_id,
@@ -110,15 +128,34 @@ class AnalyticalLayerLoader:
JOIN `{get_table_id('dim_time', layer='gold')}` t ON f.time_id = t.time_id
"""
self.logger.info("Loading fact table with dimensions...")
self.df_clean = self.client.query(query).result().to_dataframe(create_bqstorage_client=False)
self.logger.info("Loading fact table with dimensions (incl. framework)...")
self.df_clean = self.client.query(query).result().to_dataframe(
create_bqstorage_client=False
)
self.logger.info(f" Loaded: {len(self.df_clean):,} rows")
if 'is_year_range' in self.df_clean.columns:
yr = self.df_clean['is_year_range'].value_counts()
self.logger.info(f" Breakdown:")
self.logger.info(f" Single years (is_year_range=False): {yr.get(False, 0):,}")
self.logger.info(f" Year ranges (is_year_range=True): {yr.get(True, 0):,}")
self.logger.info(
f" Single years (is_year_range=False): {yr.get(False, 0):,}"
)
self.logger.info(
f" Year ranges (is_year_range=True): {yr.get(True, 0):,}"
)
# Validasi kolom framework tersedia
if 'framework' not in self.df_clean.columns:
raise ValueError(
"Kolom 'framework' tidak ditemukan di dim_indicator. "
"Pastikan bigquery_cleaned_layer.py dan bigquery_dimensional_model.py "
"sudah dijalankan dengan versi terbaru."
)
fw_dist = self.df_clean.drop_duplicates('indicator_id')['framework'].value_counts()
self.logger.info(f" Framework distribution (per indikator unik):")
for fw, cnt in fw_dist.items():
self.logger.info(f" {fw}: {cnt} indicators")
self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
self.df_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
@@ -228,10 +265,15 @@ class AnalyticalLayerLoader:
self.logger.info(f"\n [+] Valid: {len(valid_combinations):,}")
self.logger.info(f" [-] Removed: {len(removed_combinations):,}")
df_valid = pd.DataFrame(valid_combinations)
df_valid['key'] = df_valid['country_id'].astype(str) + '_' + df_valid['indicator_id'].astype(str)
self.df_clean['key'] = (self.df_clean['country_id'].astype(str) + '_' +
self.df_clean['indicator_id'].astype(str))
df_valid = pd.DataFrame(valid_combinations)
df_valid['key'] = (
df_valid['country_id'].astype(str) + '_' +
df_valid['indicator_id'].astype(str)
)
self.df_clean['key'] = (
self.df_clean['country_id'].astype(str) + '_' +
self.df_clean['indicator_id'].astype(str)
)
original_count = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['key'].isin(df_valid['key'])].copy()
@@ -265,13 +307,17 @@ class AnalyticalLayerLoader:
f"{row['pillar_count']}/{total_pillars} pillars"
)
selected_countries = country_pillar_count[country_pillar_count['pillar_count'] == total_pillars]
selected_countries = country_pillar_count[
country_pillar_count['pillar_count'] == total_pillars
]
self.selected_country_ids = selected_countries['country_id'].tolist()
self.logger.info(f"\n FIXED SET: {len(self.selected_country_ids)} countries")
original_count = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['country_id'].isin(self.selected_country_ids)].copy()
self.df_clean = self.df_clean[
self.df_clean['country_id'].isin(self.selected_country_ids)
].copy()
self.logger.info(f" Rows before: {original_count:,}")
self.logger.info(f" Rows after: {len(self.df_clean):,}")
@@ -285,7 +331,9 @@ class AnalyticalLayerLoader:
indicator_country_start = self.df_clean.groupby([
'indicator_id', 'indicator_name', 'country_id'
])['year'].min().reset_index()
indicator_country_start.columns = ['indicator_id', 'indicator_name', 'country_id', 'start_year']
indicator_country_start.columns = [
'indicator_id', 'indicator_name', 'country_id', 'start_year'
]
indicator_max_start = indicator_country_start.groupby([
'indicator_id', 'indicator_name'
@@ -324,7 +372,9 @@ class AnalyticalLayerLoader:
else:
removed_indicators.append({
'indicator_name': indicator_name,
'reason' : f"missing countries in years: {', '.join(problematic_years[:5])}"
'reason' : (
f"missing countries in years: {', '.join(problematic_years[:5])}"
)
})
self.logger.info(f"\n [+] Valid: {len(valid_indicators)}")
@@ -334,12 +384,17 @@ class AnalyticalLayerLoader:
raise ValueError("No valid indicators found after filtering!")
original_count = len(self.df_clean)
self.df_clean = self.df_clean[self.df_clean['indicator_id'].isin(valid_indicators)].copy()
self.df_clean = self.df_clean[
self.df_clean['indicator_id'].isin(valid_indicators)
].copy()
self.df_clean = self.df_clean.merge(
indicator_max_start[['indicator_id', 'max_start_year']], on='indicator_id', how='left'
indicator_max_start[['indicator_id', 'max_start_year']],
on='indicator_id', how='left'
)
self.df_clean = self.df_clean[self.df_clean['year'] >= self.df_clean['max_start_year']].copy()
self.df_clean = self.df_clean[
self.df_clean['year'] >= self.df_clean['max_start_year']
].copy()
self.df_clean = self.df_clean.drop('max_start_year', axis=1)
self.logger.info(f"\n Rows before: {original_count:,}")
@@ -355,12 +410,16 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80)
expected_countries = len(self.selected_country_ids)
verification = self.df_clean.groupby(['indicator_id', 'year'])['country_id'].nunique().reset_index()
verification = self.df_clean.groupby(
['indicator_id', 'year']
)['country_id'].nunique().reset_index()
verification.columns = ['indicator_id', 'year', 'country_count']
all_good = (verification['country_count'] == expected_countries).all()
if all_good:
self.logger.info(f" VERIFICATION PASSED — all combinations have {expected_countries} countries")
self.logger.info(
f" VERIFICATION PASSED — all combinations have {expected_countries} countries"
)
else:
bad = verification[verification['country_count'] != expected_countries]
for _, row in bad.head(10).iterrows():
@@ -372,6 +431,101 @@ class AnalyticalLayerLoader:
return True
def calculate_yoy(self):
    """
    Compute Year-over-Year (YoY) change per indicator per country.

    Reads ``self.df_clean`` (columns used: country_id, country_name,
    indicator_id, indicator_name, year, value) and replaces it with a copy
    sorted by (country_id, indicator_id, year) that carries two extra
    columns:

        yoy_change : absolute change  -> value - previous-year value
        yoy_pct    : relative change  -> (yoy_change / abs(previous value)) * 100

    Notes:
        - The first year of every country-indicator pair has no earlier
          value to compare against, so both columns are NaN there
          (intentional).
        - A row whose predecessor inside the group is not exactly one year
          earlier is treated like a base year (both columns NaN), so a
          multi-year jump is never reported as a year-over-year change.
        - yoy_pct is NaN when the previous value is 0 (division by zero);
          yoy_change is still populated in that case.
        - The helper column value_prev is dropped before the frame is
          stored back on ``self.df_clean``.
        - Summary statistics are written to ``self.logger``.

    Returns:
        pandas.DataFrame: the updated ``self.df_clean``.
    """
    self.logger.info("\n" + "=" * 80)
    self.logger.info("STEP 6b: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY")
    self.logger.info("=" * 80)

    group_keys = ['country_id', 'indicator_id']
    df = self.df_clean.sort_values(group_keys + ['year']).copy()
    grouped = df.groupby(group_keys)

    # Previous row's value/year inside each country-indicator group.
    prev_value = grouped['value'].shift(1)
    prev_year = grouped['year'].shift(1)
    is_base_row = grouped.cumcount() == 0

    # Only accept the previous row as "last year" when it really is year - 1;
    # otherwise the difference would span several years.
    year_gap = (
        pd.to_numeric(df['year'], errors='coerce')
        - pd.to_numeric(prev_year, errors='coerce')
    )
    consecutive = year_gap.eq(1).fillna(False).astype(bool)
    df['value_prev'] = prev_value.where(consecutive)

    # YoY absolute change: value(t) - value(t-1)
    df['yoy_change'] = df['value'] - df['value_prev']

    # YoY percentage change: (yoy_change / |value_prev|) * 100.
    # A zero denominator is masked to NaN first, so the result is NaN rather
    # than +/-inf; a NaN value_prev propagates as NaN on its own.
    zero_prev = df['value_prev'].eq(0)
    safe_denominator = df['value_prev'].abs().where(~zero_prev)
    df['yoy_pct'] = df['yoy_change'] / safe_denominator * 100

    # The helper column is not part of the stored table.
    df = df.drop(columns=['value_prev'])

    # Summary log
    total_rows = len(df)
    valid_yoy = int(df['yoy_pct'].notna().sum())
    null_yoy = int(df['yoy_pct'].isna().sum())
    gap_rows = int((~is_base_row & ~consecutive).sum())
    zero_base_rows = int(zero_prev.sum())
    self.logger.info(f" Total rows : {total_rows:,}")
    self.logger.info(f" YoY calculated : {valid_yoy:,}")
    self.logger.info(f" YoY NULL (base yr): {null_yoy:,} <- tahun pertama per country-indicator")
    # Not every NULL is a first-year row; report the other causes explicitly.
    if gap_rows:
        self.logger.warning(
            f" YoY NULL (year gap): {gap_rows:,} <- previous row is not year-1"
        )
    if zero_base_rows:
        self.logger.info(
            f" YoY pct NULL (zero base): {zero_base_rows:,} <- previous value is 0"
        )

    with_pct = df[df['yoy_pct'].notna()]

    # YoY distribution per indicator (sample)
    per_ind = (
        with_pct
        .groupby(['indicator_id', 'indicator_name'])['yoy_pct']
        .agg(['mean', 'std', 'min', 'max'])
        .reset_index()
    )
    self.logger.info(f"\n YoY summary per indicator (top 10 by abs mean change):")
    self.logger.info(f" {'-'*100}")
    self.logger.info(
        f" {'ID':<5} {'Indicator Name':<52} {'Mean%':>8} {'Std%':>8} {'Min%':>8} {'Max%':>8}"
    )
    self.logger.info(f" {'-'*100}")
    top_ind = per_ind.reindex(
        per_ind['mean'].abs().sort_values(ascending=False).index
    ).head(10)
    for _, row in top_ind.iterrows():
        self.logger.info(
            f" {int(row['indicator_id']):<5} {row['indicator_name'][:50]:<52} "
            f"{row['mean']:>+8.2f} {row['std']:>8.2f} "
            f"{row['min']:>+8.2f} {row['max']:>+8.2f}"
        )

    # YoY distribution per country (summary)
    per_country = (
        with_pct
        .groupby(['country_id', 'country_name'])['yoy_pct']
        .agg(['mean', 'std'])
        .reset_index()
    )
    per_country.columns = ['country_id', 'country_name', 'mean_yoy', 'std_yoy']
    self.logger.info(f"\n YoY summary per country:")
    self.logger.info(f" {'-'*60}")
    self.logger.info(f" {'Country':<30} {'Mean YoY%':>10} {'Std YoY%':>10}")
    self.logger.info(f" {'-'*60}")
    for _, row in per_country.sort_values('mean_yoy', ascending=False).iterrows():
        self.logger.info(
            f" {row['country_name']:<30} {row['mean_yoy']:>+10.2f} {row['std_yoy']:>10.2f}"
        )

    self.df_clean = df
    self.logger.info(f"\n [OK] YoY columns added: yoy_change, yoy_pct")
    return self.df_clean
def analyze_indicator_availability_by_year(self):
self.logger.info("\n" + "=" * 80)
self.logger.info("STEP 7: ANALYZE INDICATOR AVAILABILITY BY YEAR")
@@ -394,39 +548,62 @@ class AnalyticalLayerLoader:
)
indicator_details = self.df_clean.groupby([
'indicator_id', 'indicator_name', 'pillar_name', 'direction'
'indicator_id', 'indicator_name', 'pillar_name', 'direction', 'framework'
]).agg({'year': ['min', 'max'], 'country_id': 'nunique'}).reset_index()
indicator_details.columns = [
'indicator_id', 'indicator_name', 'pillar_name', 'direction',
'indicator_id', 'indicator_name', 'pillar_name', 'direction', 'framework',
'start_year', 'end_year', 'country_count'
]
indicator_details['year_range'] = (
indicator_details['start_year'].astype(int).astype(str) + '-' +
indicator_details['end_year'].astype(int).astype(str)
)
indicator_details = indicator_details.sort_values(['pillar_name', 'start_year', 'indicator_name'])
indicator_details = indicator_details.sort_values(
['framework', 'pillar_name', 'start_year', 'indicator_name']
)
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
for pillar, count in indicator_details.groupby('pillar_name').size().items():
self.logger.info(f" {pillar}: {count} indicators")
self.logger.info(f"\n{'-'*100}")
self.logger.info(f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} {'Years':<12} {'Dir':<8} {'Countries'}")
self.logger.info(f"{'-'*100}")
self.logger.info(f"\nFramework breakdown:")
for fw, count in indicator_details.groupby('framework').size().items():
self.logger.info(f" {fw}: {count} indicators")
self.logger.info(f"\n{'-'*110}")
self.logger.info(
f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} "
f"{'Framework':<10} {'Years':<12} {'Dir':<8} {'Countries'}"
)
self.logger.info(f"{'-'*110}")
for _, row in indicator_details.iterrows():
direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-'
self.logger.info(
f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} "
f"{row['pillar_name'][:13]:<15} {row['year_range']:<12} "
f"{direction:<8} {int(row['country_count'])}"
f"{row['pillar_name'][:13]:<15} {row['framework']:<10} "
f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}"
)
return year_stats
def save_analytical_table(self):
# ---------------------------------------------------------------
# CHANGED: nama tabel baru + kolom lengkap untuk Looker Studio
# ---------------------------------------------------------------
"""
Simpan fact_asean_food_security_selected ke Gold layer.
Kolom yang disimpan:
country_id, country_name — dimensi negara
indicator_id, indicator_name — dimensi indikator
direction — arah penilaian (higher/lower_better)
framework — MDGs / SDGs (untuk filter Looker Studio)
pillar_id, pillar_name — dimensi pilar
time_id, year — dimensi waktu
value — nilai indikator
yoy_change — perubahan absolut YoY (NULLABLE: NULL di tahun pertama)
yoy_pct — perubahan relatif YoY dalam % (NULLABLE: NULL di tahun pertama)
Kolom framework memungkinkan filter langsung di Looker Studio tanpa join ke dim_indicator.
Kolom yoy_change dan yoy_pct memungkinkan analisis tren tahunan langsung di Looker Studio.
"""
table_name = 'fact_asean_food_security_selected'
self.logger.info("\n" + "=" * 80)
@@ -434,22 +611,48 @@ class AnalyticalLayerLoader:
self.logger.info("=" * 80)
try:
# ------------------------------------------------------------------
# Pilih kolom: ID + Nama lengkap + value
# Kolom nama memudahkan filtering/slicing langsung di Looker Studio
# tanpa perlu join ulang ke tabel dimensi.
# ------------------------------------------------------------------
# Pastikan kolom framework tersedia di df_clean
if 'framework' not in self.df_clean.columns:
self.logger.warning(
" [WARN] Kolom 'framework' tidak ada di df_clean. "
"Melakukan join ke dim_indicator sebagai fallback..."
)
dim_ind = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
if 'framework' in dim_ind.columns:
self.df_clean = self.df_clean.merge(
dim_ind[['indicator_id', 'framework']],
on='indicator_id', how='left'
)
self.df_clean['framework'] = self.df_clean['framework'].fillna('MDGs')
self.logger.info(" [OK] framework di-join dari dim_indicator")
else:
self.df_clean['framework'] = 'MDGs'
self.logger.warning(
" [WARN] dim_indicator juga tidak punya kolom framework. "
"Default: MDGs. Jalankan ulang pipeline dari cleaned_layer."
)
# Pastikan kolom YoY tersedia — fallback jika calculate_yoy() tidak dipanggil
if 'yoy_change' not in self.df_clean.columns or 'yoy_pct' not in self.df_clean.columns:
self.logger.warning(
" [WARN] Kolom YoY tidak ditemukan. Menjalankan calculate_yoy() sebagai fallback..."
)
self.calculate_yoy()
analytical_df = self.df_clean[[
'country_id',
'country_name',
'indicator_id',
'indicator_name',
'direction',
'framework',
'pillar_id',
'pillar_name',
'time_id',
'year',
'value',
'yoy_change',
'yoy_pct',
]].copy()
analytical_df = analytical_df.sort_values(
@@ -462,27 +665,46 @@ class AnalyticalLayerLoader:
analytical_df['indicator_id'] = analytical_df['indicator_id'].astype(int)
analytical_df['indicator_name']= analytical_df['indicator_name'].astype(str)
analytical_df['direction'] = analytical_df['direction'].astype(str)
analytical_df['framework'] = analytical_df['framework'].astype(str)
analytical_df['pillar_id'] = analytical_df['pillar_id'].astype(int)
analytical_df['pillar_name'] = analytical_df['pillar_name'].astype(str)
analytical_df['time_id'] = analytical_df['time_id'].astype(int)
analytical_df['year'] = analytical_df['year'].astype(int)
analytical_df['value'] = analytical_df['value'].astype(float)
# yoy_change dan yoy_pct tetap float — NULL (NaN) di tahun pertama adalah intentional
analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float)
analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float)
self.logger.info(f" Kolom yang disimpan: {list(analytical_df.columns)}")
self.logger.info(f" Total rows: {len(analytical_df):,}")
# Schema BigQuery
# Log distribusi framework
fw_dist = analytical_df.drop_duplicates('indicator_id')['framework'].value_counts()
self.logger.info(f" Framework distribution (per indikator unik):")
for fw, cnt in fw_dist.items():
self.logger.info(f" {fw}: {cnt} indicators")
# Log statistik YoY
yoy_valid = analytical_df['yoy_pct'].notna().sum()
yoy_null = analytical_df['yoy_pct'].isna().sum()
self.logger.info(f" YoY rows (calculated): {yoy_valid:,}")
self.logger.info(f" YoY rows (NULL/base) : {yoy_null:,}")
schema = [
bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("country_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("indicator_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("indicator_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
bigquery.SchemaField("pillar_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("pillar_name", "STRING", mode="REQUIRED"),
bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"),
# NULLABLE karena tahun pertama per country-indicator tidak memiliki nilai sebelumnya
bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"),
bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"),
]
rows_loaded = load_to_bigquery(
@@ -508,17 +730,25 @@ class AnalyticalLayerLoader:
'fixed_countries': len(self.selected_country_ids),
'no_gaps' : True,
'layer' : 'gold',
'columns' : 'id + name + value (Looker Studio ready)'
'columns' : (
'id + name + direction + framework + value + '
'yoy_change + yoy_pct (Looker Studio ready)'
)
}),
'validation_metrics' : json.dumps({
'fixed_countries' : len(self.selected_country_ids),
'total_indicators': int(self.df_clean['indicator_id'].nunique())
'total_indicators': int(self.df_clean['indicator_id'].nunique()),
'framework_dist' : fw_dist.to_dict(),
'yoy_rows_valid' : int(yoy_valid),
'yoy_rows_null' : int(yoy_null),
})
}
save_etl_metadata(self.client, metadata)
self.logger.info(f"{table_name}: {rows_loaded:,} rows → [DW/Gold] fs_asean_gold")
self.logger.info(f" Metadata → [AUDIT] etl_metadata")
self.logger.info(
f" {table_name}: {rows_loaded:,} rows -> [DW/Gold] fs_asean_gold"
)
self.logger.info(f" Metadata -> [AUDIT] etl_metadata")
return rows_loaded
except Exception as e:
@@ -530,7 +760,9 @@ class AnalyticalLayerLoader:
self.pipeline_metadata['start_time'] = self.pipeline_start
self.logger.info("\n" + "=" * 80)
self.logger.info("Output: fact_asean_food_security_selected fs_asean_gold")
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
self.logger.info("Kolom: country_id/name, indicator_id/name, direction, framework,")
self.logger.info(" pillar_id/name, time_id, year, value, yoy_change, yoy_pct")
self.logger.info("=" * 80)
self.load_source_data()
@@ -539,6 +771,7 @@ class AnalyticalLayerLoader:
self.select_countries_with_all_pillars()
self.filter_indicators_consistent_across_fixed_countries()
self.verify_no_gaps()
self.calculate_yoy() # <-- Step 6b: hitung YoY
self.analyze_indicator_availability_by_year()
self.save_analytical_table()
@@ -577,7 +810,7 @@ def run_analytical_layer():
if __name__ == "__main__":
print("=" * 80)
print("Output: fact_asean_food_security_selected fs_asean_gold")
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
print("=" * 80)
logger = setup_logging()