raw and staging data
This commit is contained in:
@@ -33,8 +33,8 @@ from pathlib import Path
|
||||
from difflib import SequenceMatcher
|
||||
import re
|
||||
|
||||
from bigquery_config import get_bigquery_client, CONFIG, EXPORTS_DIR, LOGS_DIR, get_table_id
|
||||
from bigquery_helpers import (
|
||||
from scripts.bigquery_config import get_bigquery_client, CONFIG, EXPORTS_DIR, LOGS_DIR, get_table_id
|
||||
from scripts.bigquery_helpers import (
|
||||
log_update,
|
||||
load_to_bigquery,
|
||||
read_from_bigquery,
|
||||
@@ -42,9 +42,10 @@ from bigquery_helpers import (
|
||||
save_etl_metadata,
|
||||
get_staging_schema
|
||||
)
|
||||
from bigquery_datasource import DataSource
|
||||
from scripts.bigquery_datasource import DataSource
|
||||
from google.cloud import bigquery
|
||||
|
||||
|
||||
# INDICATOR MATCHER
|
||||
|
||||
class IndicatorMatcher:
|
||||
@@ -200,7 +201,7 @@ class IndicatorMatcher:
|
||||
class FAODataSource(DataSource):
|
||||
"""
|
||||
FAO Food Security Data Source (BigQuery version)
|
||||
FIXED: Menggunakan bulk download karena faostat API butuh autentikasi
|
||||
Menggunakan bulk download karena faostat API butuh autentikasi
|
||||
"""
|
||||
|
||||
def __init__(self, client: bigquery.Client = None):
|
||||
@@ -447,28 +448,22 @@ class StagingDataIntegration:
|
||||
}
|
||||
|
||||
def load_raw_data(self) -> Dict[str, pd.DataFrame]:
    """Load every RAW-layer (Bronze) table into a DataFrame, best-effort.

    Each source is read independently through ``read_from_bigquery``. If a
    read fails for any reason, that source is mapped to an empty DataFrame
    rather than aborting the whole load, and the failure is logged as a
    warning so it is not mistaken for a genuinely empty table.

    Returns:
        A dict that always contains the keys ``'fao'``, ``'worldbank'`` and
        ``'unicef'`` (in that order), each mapped to a DataFrame.
    """
    import logging  # local import: the file's import header is not fully visible here

    # Prefer the instance logger; fall back to a module logger so that
    # reporting a failure can never itself break the best-effort contract.
    logger = getattr(self, 'logger', None) or logging.getLogger(__name__)

    # (result key, Bronze table name) pairs, in the original load order.
    raw_tables = (
        ('fao', 'raw_fao'),
        ('worldbank', 'raw_worldbank'),
        ('unicef', 'raw_unicef'),
    )

    raw_data = {}
    for source, table in raw_tables:
        try:
            raw_data[source] = read_from_bigquery(self.client, table, layer='bronze')
        except Exception as exc:
            # Deliberately broad: a missing or unreadable table must not stop
            # the remaining sources from loading.
            logger.warning(
                "Could not load %s from bronze layer (%s: %s); using empty DataFrame",
                table, type(exc).__name__, exc,
            )
            raw_data[source] = pd.DataFrame()

    return raw_data
|
||||
|
||||
def clean_value(self, value):
|
||||
"""Clean dan convert value ke float"""
|
||||
if pd.isna(value):
|
||||
return None
|
||||
value_str = str(value).strip().replace('<', '').replace('>', '').strip()
|
||||
@@ -478,18 +473,9 @@ class StagingDataIntegration:
|
||||
return None
|
||||
|
||||
def process_year_range(self, year_value):
|
||||
"""
|
||||
Process year range dan return (year_int, year_range_str)
|
||||
Examples:
|
||||
"2020" → (2020, "2020")
|
||||
"2020-2021" → (2020, "2020-2021")
|
||||
"2019–2021" → (2020, "2019-2021")
|
||||
"""
|
||||
if pd.isna(year_value):
|
||||
return None, None
|
||||
|
||||
year_str = str(year_value).strip().replace('–', '-').replace('—', '-')
|
||||
|
||||
if '-' in year_str:
|
||||
try:
|
||||
parts = year_str.split('-')
|
||||
@@ -509,7 +495,6 @@ class StagingDataIntegration:
|
||||
return None, year_str
|
||||
|
||||
def truncate_string(self, value, max_length: int) -> str:
|
||||
"""Truncate string sesuai varchar constraint"""
|
||||
if pd.isna(value):
|
||||
return ''
|
||||
s = str(value).strip()
|
||||
@@ -519,7 +504,6 @@ class StagingDataIntegration:
|
||||
indicator_orig_col: str, indicator_std_col: str,
|
||||
country_col: str, year_col: str, value_col: str,
|
||||
unit_col: str = None) -> pd.DataFrame:
|
||||
"""Standardize dataframe ke schema staging_integrated"""
|
||||
if df.empty:
|
||||
return pd.DataFrame()
|
||||
|
||||
@@ -543,10 +527,9 @@ class StagingDataIntegration:
|
||||
})
|
||||
|
||||
def standardize_schema(self, raw_data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
|
||||
"""Standardize schema dari semua sumber data"""
|
||||
integrated_data = []
|
||||
|
||||
# FAO — deteksi kolom (nama asli atau sudah di-rename)
|
||||
# FAO
|
||||
if not raw_data['fao'].empty:
|
||||
df = raw_data['fao'].copy()
|
||||
integrated_data.append(self.standardize_dataframe(
|
||||
@@ -590,11 +573,9 @@ class StagingDataIntegration:
|
||||
|
||||
df_integrated = pd.concat(integrated_data, ignore_index=True)
|
||||
|
||||
# Final type conversion
|
||||
df_integrated['year'] = pd.to_numeric(df_integrated['year'], errors='coerce')
|
||||
df_integrated['value'] = pd.to_numeric(df_integrated['value'], errors='coerce')
|
||||
|
||||
# Enforce varchar constraints
|
||||
for col, max_len in [('source', 20), ('country', 100), ('indicator_original', 255),
|
||||
('indicator_standardized', 255), ('year_range', 20), ('unit', 20)]:
|
||||
df_integrated[col] = df_integrated[col].astype(str).apply(
|
||||
@@ -606,7 +587,6 @@ class StagingDataIntegration:
|
||||
).reset_index(drop=True)
|
||||
|
||||
def validate_data(self, df: pd.DataFrame) -> Dict:
|
||||
"""Validate data dan return metrics"""
|
||||
validation = {
|
||||
'total_rows' : int(len(df)),
|
||||
'total_columns' : int(len(df.columns)),
|
||||
@@ -621,15 +601,12 @@ class StagingDataIntegration:
|
||||
'max' : int(df['year'].max()) if not df['year'].isnull().all() else None,
|
||||
'unique_years': int(df['year'].nunique())
|
||||
}
|
||||
|
||||
if 'source' in df.columns:
|
||||
validation['source_breakdown'] = {
|
||||
str(k): int(v) for k, v in df['source'].value_counts().to_dict().items()
|
||||
}
|
||||
|
||||
if 'indicator_standardized' in df.columns:
|
||||
validation['unique_indicators'] = int(df['indicator_standardized'].nunique())
|
||||
|
||||
if 'country' in df.columns:
|
||||
validation['unique_countries'] = int(df['country'].nunique())
|
||||
|
||||
@@ -645,21 +622,15 @@ class StagingDataIntegration:
|
||||
return validation
|
||||
|
||||
def save_to_staging(self, df: pd.DataFrame):
|
||||
"""Save data ke staging_integrated table di STAGING layer (Silver)"""
|
||||
try:
|
||||
schema = get_staging_schema()
|
||||
|
||||
load_to_bigquery(
|
||||
self.client,
|
||||
df,
|
||||
self.staging_table,
|
||||
layer='silver', # → fs_asean_silver
|
||||
self.client, df, self.staging_table,
|
||||
layer='silver',
|
||||
write_disposition="WRITE_TRUNCATE",
|
||||
schema=schema
|
||||
)
|
||||
|
||||
log_update(self.client, 'STAGING', self.staging_table, 'full_refresh', len(df))
|
||||
|
||||
except Exception as e:
|
||||
print(f"save_to_staging FAILED: {type(e).__name__}: {e}")
|
||||
log_update(self.client, 'STAGING', self.staging_table, 'full_refresh', 0,
|
||||
@@ -667,7 +638,6 @@ class StagingDataIntegration:
|
||||
raise
|
||||
|
||||
def run(self) -> pd.DataFrame:
|
||||
"""Run staging integration process"""
|
||||
self.metadata['start_time'] = datetime.now()
|
||||
|
||||
try:
|
||||
@@ -703,7 +673,6 @@ class StagingDataIntegration:
|
||||
|
||||
save_etl_metadata(self.client, self.metadata)
|
||||
|
||||
# Summary
|
||||
print(f" ✓ Staging Integration completed: {len(df_integrated):,} rows")
|
||||
print(f" Duration : {self.metadata['duration_seconds']:.2f}s")
|
||||
if 'source_breakdown' in validation:
|
||||
@@ -725,7 +694,6 @@ class StagingDataIntegration:
|
||||
print(f" - country max length : {schema_val['country_max_length']}/100")
|
||||
print(f" - year_range max length : {schema_val['year_range_max_length']}/20")
|
||||
print(f" - unit max length : {schema_val['unit_max_length']}/20")
|
||||
|
||||
print(f"\n Metadata → [AUDIT] etl_metadata")
|
||||
|
||||
return df_integrated
|
||||
@@ -734,6 +702,7 @@ class StagingDataIntegration:
|
||||
self.logger.error(f"Staging integration failed: {str(e)}")
|
||||
raise
|
||||
|
||||
|
||||
# MAIN EXECUTION
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -745,11 +714,9 @@ if __name__ == "__main__":
|
||||
logger = setup_logging()
|
||||
client = get_bigquery_client()
|
||||
|
||||
# ── FAO ──────────────────────────────────────────────────────────────────
|
||||
print("\n[1/4] Loading FAO Food Security Data → RAW (Bronze)...")
|
||||
fao_source = FAODataSource(client)
|
||||
df_fao = fao_source.run()
|
||||
|
||||
print(f" ✓ raw_fao: {len(df_fao):,} rows")
|
||||
print(f" Indicators : {df_fao['indicator'].nunique()}")
|
||||
print(f" Countries : {df_fao['country'].nunique()}")
|
||||
@@ -757,28 +724,23 @@ if __name__ == "__main__":
|
||||
|
||||
fao_indicators = df_fao['indicator'].unique()
|
||||
|
||||
# ── World Bank ────────────────────────────────────────────────────────────
|
||||
print("\n[2/4] Loading World Bank Data → RAW (Bronze)...")
|
||||
wb_source = WorldBankDataSource(client, list(fao_indicators))
|
||||
df_wb = wb_source.run()
|
||||
|
||||
print(f" ✓ raw_worldbank: {len(df_wb):,} rows")
|
||||
print(f" Matched indicators : {df_wb['indicator_fao'].nunique()}")
|
||||
print(f" Countries : {df_wb['country'].nunique()}")
|
||||
if len(df_wb) > 0:
|
||||
print(f" Year range : {df_wb['year'].min()}–{df_wb['year'].max()}")
|
||||
|
||||
# ── UNICEF ────────────────────────────────────────────────────────────────
|
||||
print("\n[3/4] Loading UNICEF Data → RAW (Bronze)...")
|
||||
unicef_source = UNICEFDataSource(client, list(fao_indicators))
|
||||
df_unicef = unicef_source.run()
|
||||
|
||||
print(f" ✓ raw_unicef: {len(df_unicef):,} rows")
|
||||
if len(df_unicef) > 0:
|
||||
print(f" Matched indicators : {df_unicef['indicator_fao'].nunique()}")
|
||||
print(f" Countries : {df_unicef['country'].nunique()}")
|
||||
|
||||
# ── Staging Integration ───────────────────────────────────────────────────
|
||||
print("\n[4/4] Staging Integration → STAGING (Silver)...")
|
||||
staging = StagingDataIntegration(client)
|
||||
df_staging = staging.run()
|
||||
@@ -789,7 +751,8 @@ if __name__ == "__main__":
|
||||
print(f"STAGING (Silver) : staging_integrated")
|
||||
print(f"AUDIT : etl_logs, etl_metadata")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
|
||||
# AIRFLOW TASK FUNCTIONS
|
||||
|
||||
def run_verify_connection():
|
||||
|
||||
Reference in New Issue
Block a user