Move SDGs framework column out of dim_indicator to analytical_layer

This commit is contained in:
Debby
2026-03-31 13:54:20 +07:00
parent 82ce018913
commit ddf15ca9a5
3 changed files with 321 additions and 237 deletions

View File

@@ -52,7 +52,7 @@ class DimensionalModelLoader:
Pipeline steps:
1. Load dim_country
2. Load dim_indicator (+ kolom framework dari cleaned_integrated)
2. Load dim_indicator
3. Load dim_time
4. Load dim_source
5. Load dim_pillar
@@ -313,7 +313,6 @@ class DimensionalModelLoader:
indicator_category — kategori (Health & Nutrition, dll.)
unit — satuan ukuran
direction — higher_better / lower_better
framework — MDGs / SDGs <-- BARU: dibaca dari cleaned_integrated
"""
table_name = 'dim_indicator'
self.load_metadata[table_name]['start_time'] = datetime.now()
@@ -323,7 +322,6 @@ class DimensionalModelLoader:
has_direction = 'direction' in self.df_clean.columns
has_unit = 'unit' in self.df_clean.columns
has_category = 'indicator_category' in self.df_clean.columns
has_framework = 'framework' in self.df_clean.columns
dim_indicator = self.df_clean[['indicator_standardized']].drop_duplicates().copy()
dim_indicator.columns = ['indicator_name']
@@ -381,29 +379,10 @@ class DimensionalModelLoader:
categorize_indicator
)
# Framework — KOLOM BARU
# Dibaca dari cleaned_integrated yang sudah menjalankan assign_framework().
# Jika kolom belum ada (misal pipeline lama), fallback ke 'MDGs' dengan warning.
if has_framework:
fw_map = self.df_clean[
['indicator_standardized', 'framework']
].drop_duplicates()
fw_map.columns = ['indicator_name', 'framework']
dim_indicator = dim_indicator.merge(fw_map, on='indicator_name', how='left')
# Pastikan tidak ada NULL setelah merge
dim_indicator['framework'] = dim_indicator['framework'].fillna('MDGs')
self.logger.info(" [OK] framework column from cleaned_integrated")
else:
dim_indicator['framework'] = 'MDGs'
self.logger.warning(
" [WARN] framework column not found in cleaned_integrated. "
"Default: MDGs. Jalankan bigquery_cleaned_layer.py terlebih dahulu."
)
dim_indicator = dim_indicator.drop_duplicates(subset=['indicator_name'], keep='first')
dim_indicator_final = dim_indicator[
['indicator_name', 'indicator_category', 'unit', 'direction', 'framework']
['indicator_name', 'indicator_category', 'unit', 'direction']
].copy()
dim_indicator_final = dim_indicator_final.reset_index(drop=True)
dim_indicator_final.insert(0, 'indicator_id', range(1, len(dim_indicator_final) + 1))
@@ -414,7 +393,6 @@ class DimensionalModelLoader:
bigquery.SchemaField("indicator_category", "STRING", mode="REQUIRED"),
bigquery.SchemaField("unit", "STRING", mode="NULLABLE"),
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
]
rows_loaded = load_to_bigquery(
@@ -427,7 +405,6 @@ class DimensionalModelLoader:
for label, col in [
('Categories', 'indicator_category'),
('Direction', 'direction'),
('Framework', 'framework'),
]:
self.logger.info(f" {label}:")
for val, cnt in dim_indicator_final[col].value_counts().items():
@@ -766,22 +743,6 @@ class DimensionalModelLoader:
self.logger.info(f" Unique Sources : {int(stats['unique_sources']):>10,}")
self.logger.info(f" Unique Pillars : {int(stats['unique_pillars']):>10,}")
# Validasi distribusi framework di dim_indicator
query_fw = f"""
SELECT framework, COUNT(*) AS count
FROM `{get_table_id('dim_indicator', layer='gold')}`
GROUP BY framework ORDER BY framework
"""
df_fw = self.client.query(query_fw).result().to_dataframe(
create_bqstorage_client=False
)
if len(df_fw) > 0:
self.logger.info(f"\n Framework Distribution (dim_indicator):")
for _, row in df_fw.iterrows():
self.logger.info(
f" {row['framework']:10s}: {int(row['count']):>5,} indicators"
)
query_dir = f"""
SELECT direction, COUNT(*) AS count
FROM `{get_table_id('dim_indicator', layer='gold')}`
@@ -906,10 +867,6 @@ if __name__ == "__main__":
print(f" Year range : {int(df_clean['year'].min())}-{int(df_clean['year'].max())}")
if 'direction' in df_clean.columns:
print(f" Direction : {df_clean['direction'].value_counts().to_dict()}")
if 'framework' in df_clean.columns:
print(f" Framework : {df_clean['framework'].value_counts().to_dict()}")
else:
print(" [WARN] framework column not found — run bigquery_cleaned_layer.py first")
print("\n[1/1] Dimensional Model Load -> DW (Gold)...")
loader = DimensionalModelLoader(client, df_clean)
@@ -917,7 +874,7 @@ if __name__ == "__main__":
print("\n" + "=" * 60)
print("[OK] DIMENSIONAL MODEL ETL COMPLETED")
print(" DW (Gold) : dim_country, dim_indicator (+ framework),")
print(" dim_time, dim_source, dim_pillar, fact_food_security")
print(" DW (Gold) : dim_country, dim_indicator, dim_time,")
print(" dim_source, dim_pillar, fact_food_security")
print(" AUDIT : etl_logs, etl_metadata")
print("=" * 60)