raw and staging data

This commit is contained in:
Debby
2026-03-12 14:57:30 +07:00
parent 847a6a9859
commit 0235dfbc75
5 changed files with 30 additions and 219 deletions

View File

@@ -33,8 +33,8 @@ from pathlib import Path
from difflib import SequenceMatcher
import re
from bigquery_config import get_bigquery_client, CONFIG, EXPORTS_DIR, LOGS_DIR, get_table_id
from bigquery_helpers import (
from scripts.bigquery_config import get_bigquery_client, CONFIG, EXPORTS_DIR, LOGS_DIR, get_table_id
from scripts.bigquery_helpers import (
log_update,
load_to_bigquery,
read_from_bigquery,
@@ -42,9 +42,10 @@ from bigquery_helpers import (
save_etl_metadata,
get_staging_schema
)
from bigquery_datasource import DataSource
from scripts.bigquery_datasource import DataSource
from google.cloud import bigquery
# INDICATOR MATCHER
class IndicatorMatcher:
@@ -200,7 +201,7 @@ class IndicatorMatcher:
class FAODataSource(DataSource):
"""
FAO Food Security Data Source (BigQuery version)
FIXED: Menggunakan bulk download karena faostat API butuh autentikasi
Menggunakan bulk download karena faostat API butuh autentikasi
"""
def __init__(self, client: bigquery.Client = None):
@@ -447,28 +448,22 @@ class StagingDataIntegration:
}
def load_raw_data(self) -> Dict[str, pd.DataFrame]:
    """Load data from every RAW-layer (Bronze) table.

    Returns a dict keyed by source name ('fao', 'worldbank', 'unicef').
    Each missing/unreadable table yields an empty DataFrame instead of
    raising, so downstream integration can proceed best-effort.
    """
    sources = (
        ('fao', 'raw_fao'),
        ('worldbank', 'raw_worldbank'),
        ('unicef', 'raw_unicef'),
    )
    raw_data = {}
    for key, table_name in sources:
        try:
            raw_data[key] = read_from_bigquery(self.client, table_name, layer='bronze')
        except Exception:
            # Best-effort: a source table that cannot be read becomes empty.
            raw_data[key] = pd.DataFrame()
    return raw_data
def clean_value(self, value):
"""Clean dan convert value ke float"""
if pd.isna(value):
return None
value_str = str(value).strip().replace('<', '').replace('>', '').strip()
@@ -478,18 +473,9 @@ class StagingDataIntegration:
return None
def process_year_range(self, year_value):
"""
Process year range dan return (year_int, year_range_str)
Examples:
"2020" → (2020, "2020")
"2020-2021" → (2020, "2020-2021")
"2019–2021" → (2020, "2019-2021")
"""
if pd.isna(year_value):
return None, None
year_str = str(year_value).strip().replace('–', '-').replace('—', '-')
if '-' in year_str:
try:
parts = year_str.split('-')
@@ -509,7 +495,6 @@ class StagingDataIntegration:
return None, year_str
def truncate_string(self, value, max_length: int) -> str:
"""Truncate string sesuai varchar constraint"""
if pd.isna(value):
return ''
s = str(value).strip()
@@ -519,7 +504,6 @@ class StagingDataIntegration:
indicator_orig_col: str, indicator_std_col: str,
country_col: str, year_col: str, value_col: str,
unit_col: str = None) -> pd.DataFrame:
"""Standardize dataframe ke schema staging_integrated"""
if df.empty:
return pd.DataFrame()
@@ -543,10 +527,9 @@ class StagingDataIntegration:
})
def standardize_schema(self, raw_data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
"""Standardize schema dari semua sumber data"""
integrated_data = []
# FAO — deteksi kolom (nama asli atau sudah di-rename)
# FAO
if not raw_data['fao'].empty:
df = raw_data['fao'].copy()
integrated_data.append(self.standardize_dataframe(
@@ -590,11 +573,9 @@ class StagingDataIntegration:
df_integrated = pd.concat(integrated_data, ignore_index=True)
# Final type conversion
df_integrated['year'] = pd.to_numeric(df_integrated['year'], errors='coerce')
df_integrated['value'] = pd.to_numeric(df_integrated['value'], errors='coerce')
# Enforce varchar constraints
for col, max_len in [('source', 20), ('country', 100), ('indicator_original', 255),
('indicator_standardized', 255), ('year_range', 20), ('unit', 20)]:
df_integrated[col] = df_integrated[col].astype(str).apply(
@@ -606,7 +587,6 @@ class StagingDataIntegration:
).reset_index(drop=True)
def validate_data(self, df: pd.DataFrame) -> Dict:
"""Validate data dan return metrics"""
validation = {
'total_rows' : int(len(df)),
'total_columns' : int(len(df.columns)),
@@ -621,15 +601,12 @@ class StagingDataIntegration:
'max' : int(df['year'].max()) if not df['year'].isnull().all() else None,
'unique_years': int(df['year'].nunique())
}
if 'source' in df.columns:
validation['source_breakdown'] = {
str(k): int(v) for k, v in df['source'].value_counts().to_dict().items()
}
if 'indicator_standardized' in df.columns:
validation['unique_indicators'] = int(df['indicator_standardized'].nunique())
if 'country' in df.columns:
validation['unique_countries'] = int(df['country'].nunique())
@@ -645,21 +622,15 @@ class StagingDataIntegration:
return validation
def save_to_staging(self, df: pd.DataFrame):
"""Save data ke staging_integrated table di STAGING layer (Silver)"""
try:
schema = get_staging_schema()
load_to_bigquery(
self.client,
df,
self.staging_table,
layer='silver', # → fs_asean_silver
self.client, df, self.staging_table,
layer='silver',
write_disposition="WRITE_TRUNCATE",
schema=schema
)
log_update(self.client, 'STAGING', self.staging_table, 'full_refresh', len(df))
except Exception as e:
print(f"save_to_staging FAILED: {type(e).__name__}: {e}")
log_update(self.client, 'STAGING', self.staging_table, 'full_refresh', 0,
@@ -667,7 +638,6 @@ class StagingDataIntegration:
raise
def run(self) -> pd.DataFrame:
"""Run staging integration process"""
self.metadata['start_time'] = datetime.now()
try:
@@ -703,7 +673,6 @@ class StagingDataIntegration:
save_etl_metadata(self.client, self.metadata)
# Summary
print(f" ✓ Staging Integration completed: {len(df_integrated):,} rows")
print(f" Duration : {self.metadata['duration_seconds']:.2f}s")
if 'source_breakdown' in validation:
@@ -725,7 +694,6 @@ class StagingDataIntegration:
print(f" - country max length : {schema_val['country_max_length']}/100")
print(f" - year_range max length : {schema_val['year_range_max_length']}/20")
print(f" - unit max length : {schema_val['unit_max_length']}/20")
print(f"\n Metadata → [AUDIT] etl_metadata")
return df_integrated
@@ -734,6 +702,7 @@ class StagingDataIntegration:
self.logger.error(f"Staging integration failed: {str(e)}")
raise
# MAIN EXECUTION
if __name__ == "__main__":
@@ -745,11 +714,9 @@ if __name__ == "__main__":
logger = setup_logging()
client = get_bigquery_client()
# ── FAO ──────────────────────────────────────────────────────────────────
print("\n[1/4] Loading FAO Food Security Data → RAW (Bronze)...")
fao_source = FAODataSource(client)
df_fao = fao_source.run()
print(f" ✓ raw_fao: {len(df_fao):,} rows")
print(f" Indicators : {df_fao['indicator'].nunique()}")
print(f" Countries : {df_fao['country'].nunique()}")
@@ -757,28 +724,23 @@ if __name__ == "__main__":
fao_indicators = df_fao['indicator'].unique()
# ── World Bank ────────────────────────────────────────────────────────────
print("\n[2/4] Loading World Bank Data → RAW (Bronze)...")
wb_source = WorldBankDataSource(client, list(fao_indicators))
df_wb = wb_source.run()
print(f" ✓ raw_worldbank: {len(df_wb):,} rows")
print(f" Matched indicators : {df_wb['indicator_fao'].nunique()}")
print(f" Countries : {df_wb['country'].nunique()}")
if len(df_wb) > 0:
print(f"       Year range         : {df_wb['year'].min()}–{df_wb['year'].max()}")
# ── UNICEF ────────────────────────────────────────────────────────────────
print("\n[3/4] Loading UNICEF Data → RAW (Bronze)...")
unicef_source = UNICEFDataSource(client, list(fao_indicators))
df_unicef = unicef_source.run()
print(f" ✓ raw_unicef: {len(df_unicef):,} rows")
if len(df_unicef) > 0:
print(f" Matched indicators : {df_unicef['indicator_fao'].nunique()}")
print(f" Countries : {df_unicef['country'].nunique()}")
# ── Staging Integration ───────────────────────────────────────────────────
print("\n[4/4] Staging Integration → STAGING (Silver)...")
staging = StagingDataIntegration(client)
df_staging = staging.run()
@@ -789,7 +751,8 @@ if __name__ == "__main__":
print(f"STAGING (Silver) : staging_integrated")
print(f"AUDIT : etl_logs, etl_metadata")
print("=" * 60)
# AIRFLOW TASK FUNCTIONS
def run_verify_connection():