Move SDGs to analytical_layer
This commit is contained in:
@@ -52,7 +52,7 @@ class DimensionalModelLoader:
|
||||
|
||||
Pipeline steps:
|
||||
1. Load dim_country
|
||||
2. Load dim_indicator (+ kolom framework dari cleaned_integrated)
|
||||
2. Load dim_indicator
|
||||
3. Load dim_time
|
||||
4. Load dim_source
|
||||
5. Load dim_pillar
|
||||
@@ -313,7 +313,6 @@ class DimensionalModelLoader:
|
||||
indicator_category — kategori (Health & Nutrition, dll.)
|
||||
unit — satuan ukuran
|
||||
direction — higher_better / lower_better
|
||||
framework — MDGs / SDGs <-- BARU: dibaca dari cleaned_integrated
|
||||
"""
|
||||
table_name = 'dim_indicator'
|
||||
self.load_metadata[table_name]['start_time'] = datetime.now()
|
||||
@@ -323,7 +322,6 @@ class DimensionalModelLoader:
|
||||
has_direction = 'direction' in self.df_clean.columns
|
||||
has_unit = 'unit' in self.df_clean.columns
|
||||
has_category = 'indicator_category' in self.df_clean.columns
|
||||
has_framework = 'framework' in self.df_clean.columns
|
||||
|
||||
dim_indicator = self.df_clean[['indicator_standardized']].drop_duplicates().copy()
|
||||
dim_indicator.columns = ['indicator_name']
|
||||
@@ -381,29 +379,10 @@ class DimensionalModelLoader:
|
||||
categorize_indicator
|
||||
)
|
||||
|
||||
# Framework — KOLOM BARU
|
||||
# Dibaca dari cleaned_integrated yang sudah menjalankan assign_framework().
|
||||
# Jika kolom belum ada (misal pipeline lama), fallback ke 'MDGs' dengan warning.
|
||||
if has_framework:
|
||||
fw_map = self.df_clean[
|
||||
['indicator_standardized', 'framework']
|
||||
].drop_duplicates()
|
||||
fw_map.columns = ['indicator_name', 'framework']
|
||||
dim_indicator = dim_indicator.merge(fw_map, on='indicator_name', how='left')
|
||||
# Pastikan tidak ada NULL setelah merge
|
||||
dim_indicator['framework'] = dim_indicator['framework'].fillna('MDGs')
|
||||
self.logger.info(" [OK] framework column from cleaned_integrated")
|
||||
else:
|
||||
dim_indicator['framework'] = 'MDGs'
|
||||
self.logger.warning(
|
||||
" [WARN] framework column not found in cleaned_integrated. "
|
||||
"Default: MDGs. Jalankan bigquery_cleaned_layer.py terlebih dahulu."
|
||||
)
|
||||
|
||||
dim_indicator = dim_indicator.drop_duplicates(subset=['indicator_name'], keep='first')
|
||||
|
||||
dim_indicator_final = dim_indicator[
|
||||
['indicator_name', 'indicator_category', 'unit', 'direction', 'framework']
|
||||
['indicator_name', 'indicator_category', 'unit', 'direction']
|
||||
].copy()
|
||||
dim_indicator_final = dim_indicator_final.reset_index(drop=True)
|
||||
dim_indicator_final.insert(0, 'indicator_id', range(1, len(dim_indicator_final) + 1))
|
||||
@@ -414,7 +393,6 @@ class DimensionalModelLoader:
|
||||
bigquery.SchemaField("indicator_category", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("unit", "STRING", mode="NULLABLE"),
|
||||
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
|
||||
]
|
||||
|
||||
rows_loaded = load_to_bigquery(
|
||||
@@ -427,7 +405,6 @@ class DimensionalModelLoader:
|
||||
for label, col in [
|
||||
('Categories', 'indicator_category'),
|
||||
('Direction', 'direction'),
|
||||
('Framework', 'framework'),
|
||||
]:
|
||||
self.logger.info(f" {label}:")
|
||||
for val, cnt in dim_indicator_final[col].value_counts().items():
|
||||
@@ -766,22 +743,6 @@ class DimensionalModelLoader:
|
||||
self.logger.info(f" Unique Sources : {int(stats['unique_sources']):>10,}")
|
||||
self.logger.info(f" Unique Pillars : {int(stats['unique_pillars']):>10,}")
|
||||
|
||||
# Validasi distribusi framework di dim_indicator
|
||||
query_fw = f"""
|
||||
SELECT framework, COUNT(*) AS count
|
||||
FROM `{get_table_id('dim_indicator', layer='gold')}`
|
||||
GROUP BY framework ORDER BY framework
|
||||
"""
|
||||
df_fw = self.client.query(query_fw).result().to_dataframe(
|
||||
create_bqstorage_client=False
|
||||
)
|
||||
if len(df_fw) > 0:
|
||||
self.logger.info(f"\n Framework Distribution (dim_indicator):")
|
||||
for _, row in df_fw.iterrows():
|
||||
self.logger.info(
|
||||
f" {row['framework']:10s}: {int(row['count']):>5,} indicators"
|
||||
)
|
||||
|
||||
query_dir = f"""
|
||||
SELECT direction, COUNT(*) AS count
|
||||
FROM `{get_table_id('dim_indicator', layer='gold')}`
|
||||
@@ -906,10 +867,6 @@ if __name__ == "__main__":
|
||||
print(f" Year range : {int(df_clean['year'].min())}-{int(df_clean['year'].max())}")
|
||||
if 'direction' in df_clean.columns:
|
||||
print(f" Direction : {df_clean['direction'].value_counts().to_dict()}")
|
||||
if 'framework' in df_clean.columns:
|
||||
print(f" Framework : {df_clean['framework'].value_counts().to_dict()}")
|
||||
else:
|
||||
print(" [WARN] framework column not found — run bigquery_cleaned_layer.py first")
|
||||
|
||||
print("\n[1/1] Dimensional Model Load -> DW (Gold)...")
|
||||
loader = DimensionalModelLoader(client, df_clean)
|
||||
@@ -917,7 +874,7 @@ if __name__ == "__main__":
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("[OK] DIMENSIONAL MODEL ETL COMPLETED")
|
||||
print(" DW (Gold) : dim_country, dim_indicator (+ framework),")
|
||||
print(" dim_time, dim_source, dim_pillar, fact_food_security")
|
||||
print(" DW (Gold) : dim_country, dim_indicator, dim_time,")
|
||||
print(" dim_source, dim_pillar, fact_food_security")
|
||||
print(" AUDIT : etl_logs, etl_metadata")
|
||||
print("=" * 60)
|
||||
Reference in New Issue
Block a user