Files
airflow-coolify/dags/etl_food_security.py
2026-04-15 13:37:40 +07:00

174 lines
6.6 KiB
Python

"""
AIRFLOW DAG — ETL Food Security BigQuery
Kimball Data Warehouse Architecture
Schedule : Once every 3 months (on the 1st, at 00:00)
Cron: "0 0 1 */3 *"
-> 1 Jan, 1 Apr, 1 Jul, 1 Oct
Catchup : False
Kimball ETL Flow:
┌──────────────────────────────────────────────────────────────────────────┐
│ BRONZE (Raw) SILVER (Staging→Cleaned) GOLD (DW → Analytical) │
│ │
│ raw_fao ─┐ dim_country │
│ raw_worldbank ─┼→ staging_integrated dim_indicator │
│ raw_unicef ─┘ ↓ dim_time │
│ cleaned_integrated ───────→ dim_source │
│ dim_pillar │
│ fact_food_security │
│ ↓ │
│ analytical_food_security │
│ ↓ │
│ agg_pillar_composite │
│ agg_pillar_by_country │
│ agg_framework_by_country │
│ agg_framework_asean │
│ ↓ │
│ agg_indicator_norm │
│ │
│ AUDIT : etl_logs, etl_metadata (every layer)  │
└──────────────────────────────────────────────────────────────────────────┘
Task Order:
verify_bigquery_connection
→ load_fao_to_bronze
→ load_worldbank_to_bronze
→ load_unicef_to_bronze
→ staging_integration_to_silver
→ cleaned_integration_to_silver
→ dimensional_model_to_gold
→ analytical_layer_to_gold
→ aggregation_to_gold
→ indicator_norm_aggregation_to_gold
The scripts folder must contain:
- bigquery_raw_layer.py (run_verify_connection, run_load_fao, ...)
- bigquery_cleaned_layer.py (run_cleaned_integration)
- bigquery_dimensional_model.py (run_dimensional_model)
- bigquery_analytical_layer.py (run_analytical_layer)
- bigquery_analysis_aggregation.py (run_aggregation)
- bigquery_aggraget_fact_selected_layer.py (run_indicator_norm_aggregation)
- bigquery_config.py
- bigquery_helpers.py
- bigquery_datasource.py
"""
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime
from scripts.bigquery_raw_layer import (
run_verify_connection,
run_load_fao,
run_load_worldbank,
run_load_unicef,
run_staging_integration,
)
from scripts.bigquery_cleaned_layer import (
run_cleaned_integration,
)
from scripts.bigquery_dimensional_model import (
run_dimensional_model,
)
from scripts.bigquery_analytical_layer import (
run_analytical_layer,
)
# FIXED: module name aligned with the actual file name
from scripts.bigquery_analysis_aggregation import (
run_aggregation,
)
from scripts.bigquery_aggraget_fact_selected_layer import (
run_indicator_norm_aggregation,
)
# DEFAULT ARGS
# Shared settings handed to the DAG through its `default_args` parameter:
# the owning team and the e-mail recipient list.
default_args = dict(
    owner="data-engineering",
    email=["d1041221004@student.untan.ac.id"],
)
# DAG DEFINITION
#
# Cron expression "0 0 1 */3 *", read field by field:
#   ┌───────── minute       : 0
#   │ ┌─────── hour         : 0    (midnight)
#   │ │ ┌───── day of month : 1    (the 1st of every matching month)
#   │ │ │ ┌─── month        : */3  (every 3rd month -> Jan, Apr, Jul, Oct)
#   │ │ │ │ ┌─ day of week  : *    (any)
#   0 0 1 */3 *
#
# NOTE(review): newer Airflow releases deprecate `schedule_interval` in favour
# of `schedule` — confirm the deployed Airflow version before renaming it.
with DAG(
    dag_id="etl_food_security_bigquery",
    description="Kimball ETL: FAO, World Bank, UNICEF → BigQuery (Bronze → Silver → Gold) | Schedule: setiap 3 bulan",
    default_args=default_args,
    start_date=datetime(2026, 1, 1),
    schedule_interval="0 0 1 */3 *",  # once every 3 months
    catchup=False,
    tags=["food-security", "bigquery", "kimball", "quarterly"],
) as dag:
    # --- Connectivity check -------------------------------------------------
    task_verify = PythonOperator(
        task_id="verify_bigquery_connection",
        python_callable=run_verify_connection,
    )

    # --- Bronze: one load task per source -----------------------------------
    task_fao = PythonOperator(
        task_id="load_fao_to_bronze",
        python_callable=run_load_fao,
    )
    task_worldbank = PythonOperator(
        task_id="load_worldbank_to_bronze",
        python_callable=run_load_worldbank,
    )
    task_unicef = PythonOperator(
        task_id="load_unicef_to_bronze",
        python_callable=run_load_unicef,
    )

    # --- Silver: staging integration, then cleaned integration --------------
    task_staging = PythonOperator(
        task_id="staging_integration_to_silver",
        python_callable=run_staging_integration,
    )
    task_cleaned = PythonOperator(
        task_id="cleaned_integration_to_silver",
        python_callable=run_cleaned_integration,
    )

    # --- Gold: dimensional model, analytical layer, aggregations ------------
    task_dimensional = PythonOperator(
        task_id="dimensional_model_to_gold",
        python_callable=run_dimensional_model,
    )
    task_analytical = PythonOperator(
        task_id="analytical_layer_to_gold",
        python_callable=run_analytical_layer,
    )
    task_aggregation = PythonOperator(
        task_id="aggregation_to_gold",
        python_callable=run_aggregation,
    )
    task_indicator_norm = PythonOperator(
        task_id="indicator_norm_aggregation_to_gold",
        python_callable=run_indicator_norm_aggregation,
    )

    # Task dependencies: a single linear chain, each task running only after
    # the one before it.
    (
        task_verify
        >> task_fao
        >> task_worldbank
        >> task_unicef
        >> task_staging
        >> task_cleaned
        >> task_dimensional
        >> task_analytical
        >> task_aggregation
        >> task_indicator_norm
    )