add airflow task functions

commit 92805b9dbd
parent f72b803c46
Author: Debby
Date:   2026-03-07 15:57:38 +07:00


@@ -426,16 +426,6 @@ class StagingDataIntegration:
     Input  : RAW layer (Bronze)     — raw_fao, raw_worldbank, raw_unicef
     Output : STAGING layer (Silver) — staging_integrated
     Audit  : etl_logs, etl_metadata (Audit → fs_asean_audit)
-
-    Schema staging_integrated:
-        source                  varchar(20)
-        indicator_original      varchar(255)
-        indicator_standardized  varchar(255)
-        country                 varchar(100)
-        year                    int
-        year_range              varchar(20)
-        value                   float
-        unit                    varchar(20)
     """

     def __init__(self, client: bigquery.Client):
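
The schema block deleted above is still the only place the shape of staging_integrated is written down. For reference, a minimal sketch of creating that table through the BigQuery client; the dataset name "fs_asean" and the helper name create_staging_table are assumptions, not part of this commit:

    from google.cloud import bigquery

    def create_staging_table(client: bigquery.Client, dataset: str = "fs_asean"):
        # BigQuery has no varchar/float types; STRING, INT64, and FLOAT64
        # stand in for the types listed in the removed docstring.
        schema = [
            bigquery.SchemaField("source", "STRING"),                  # varchar(20)
            bigquery.SchemaField("indicator_original", "STRING"),      # varchar(255)
            bigquery.SchemaField("indicator_standardized", "STRING"),  # varchar(255)
            bigquery.SchemaField("country", "STRING"),                 # varchar(100)
            bigquery.SchemaField("year", "INT64"),
            bigquery.SchemaField("year_range", "STRING"),              # varchar(20)
            bigquery.SchemaField("value", "FLOAT64"),
            bigquery.SchemaField("unit", "STRING"),                    # varchar(20)
        ]
        table = bigquery.Table(f"{client.project}.{dataset}.staging_integrated", schema=schema)
        return client.create_table(table, exists_ok=True)
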
@@ -798,4 +788,47 @@ if __name__ == "__main__":
     print(f"RAW (Bronze)     : raw_fao, raw_worldbank, raw_unicef")
     print(f"STAGING (Silver) : staging_integrated")
     print(f"AUDIT            : etl_logs, etl_metadata")
-    print("=" * 60)
+    print("=" * 60)
+
+
+# AIRFLOW TASK FUNCTIONS
+def run_verify_connection():
+    from scripts.bigquery_config import verify_setup
+    result = verify_setup()
+    if not result:
+        raise Exception("BigQuery connection failed!")
+    print("BigQuery connection OK")
+
+def run_load_fao():
+    from scripts.bigquery_config import get_bigquery_client
+    client = get_bigquery_client()
+    source = FAODataSource(client)
+    df = source.run()
+    print(f"FAO loaded: {len(df):,} rows")
+
+def run_load_worldbank():
+    from scripts.bigquery_config import get_bigquery_client
+    client = get_bigquery_client()
+    fao_source = FAODataSource(client)
+    df_fao = fao_source.run()
+    fao_indicators = df_fao['indicator'].unique().tolist()
+    wb_source = WorldBankDataSource(client, fao_indicators)
+    df = wb_source.run()
+    print(f"World Bank loaded: {len(df):,} rows")
+
+def run_load_unicef():
+    from scripts.bigquery_config import get_bigquery_client
+    client = get_bigquery_client()
+    fao_source = FAODataSource(client)
+    df_fao = fao_source.run()
+    fao_indicators = df_fao['indicator'].unique().tolist()
+    unicef_source = UNICEFDataSource(client, fao_indicators)
+    df = unicef_source.run()
+    print(f"UNICEF loaded: {len(df):,} rows")
+
+def run_staging_integration():
+    from scripts.bigquery_config import get_bigquery_client
+    client = get_bigquery_client()
+    staging = StagingDataIntegration(client)
+    df = staging.run()
+    print(f"Staging integrated: {len(df):,} rows")