SDGS MDGS indicator
This commit is contained in:
@@ -8,7 +8,15 @@ Filtering Order:
|
||||
3. Filter complete indicators PER COUNTRY (auto-detect start year, no gaps)
|
||||
4. Filter countries with ALL pillars (FIXED SET)
|
||||
5. Filter indicators with consistent presence across FIXED countries
|
||||
6. Save analytical table (dengan nama/label lengkap untuk Looker Studio)
|
||||
6. Calculate YoY per indicator per country
|
||||
7. Save analytical table (dengan nama/label lengkap + kolom framework + YoY untuk Looker Studio)
|
||||
|
||||
UPDATED:
|
||||
- Kolom 'framework' (MDGs/SDGs) dipropagasi dari dim_indicator ke tabel output.
|
||||
Hal ini memungkinkan Looker Studio melakukan filter/slice berdasarkan framework
|
||||
tanpa perlu join ulang ke dim_indicator.
|
||||
- Kolom 'yoy_change' dan 'yoy_pct' ditambahkan untuk analisis Year-over-Year
|
||||
per indikator per negara langsung di Looker Studio.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
@@ -46,9 +54,17 @@ class AnalyticalLayerLoader:
|
||||
1. Complete per country (no gaps from start_year to end_year)
|
||||
2. Filter countries with all pillars
|
||||
3. Ensure indicators have consistent country count across all years
|
||||
4. Save dengan kolom lengkap (nama + ID) untuk kemudahan Looker Studio
|
||||
4. Calculate YoY (year-over-year) change per indicator per country
|
||||
5. Save dengan kolom lengkap (nama + ID + framework + YoY) untuk Looker Studio
|
||||
|
||||
Output: fact_asean_food_security_selected -> DW layer (Gold) -> fs_asean_gold
|
||||
|
||||
Kolom output:
|
||||
country_id, country_name,
|
||||
indicator_id, indicator_name, direction, framework,
|
||||
pillar_id, pillar_name,
|
||||
time_id, year, value,
|
||||
yoy_change, yoy_pct
|
||||
"""
|
||||
|
||||
def __init__(self, client: bigquery.Client):
|
||||
@@ -87,6 +103,7 @@ class AnalyticalLayerLoader:
|
||||
self.logger.info("=" * 80)
|
||||
|
||||
try:
|
||||
# Sertakan kolom framework dari dim_indicator dalam query
|
||||
query = f"""
|
||||
SELECT
|
||||
f.country_id,
|
||||
@@ -94,6 +111,7 @@ class AnalyticalLayerLoader:
|
||||
f.indicator_id,
|
||||
i.indicator_name,
|
||||
i.direction,
|
||||
i.framework,
|
||||
f.pillar_id,
|
||||
p.pillar_name,
|
||||
f.time_id,
|
||||
@@ -110,15 +128,34 @@ class AnalyticalLayerLoader:
|
||||
JOIN `{get_table_id('dim_time', layer='gold')}` t ON f.time_id = t.time_id
|
||||
"""
|
||||
|
||||
self.logger.info("Loading fact table with dimensions...")
|
||||
self.df_clean = self.client.query(query).result().to_dataframe(create_bqstorage_client=False)
|
||||
self.logger.info("Loading fact table with dimensions (incl. framework)...")
|
||||
self.df_clean = self.client.query(query).result().to_dataframe(
|
||||
create_bqstorage_client=False
|
||||
)
|
||||
self.logger.info(f" Loaded: {len(self.df_clean):,} rows")
|
||||
|
||||
if 'is_year_range' in self.df_clean.columns:
|
||||
yr = self.df_clean['is_year_range'].value_counts()
|
||||
self.logger.info(f" Breakdown:")
|
||||
self.logger.info(f" Single years (is_year_range=False): {yr.get(False, 0):,}")
|
||||
self.logger.info(f" Year ranges (is_year_range=True): {yr.get(True, 0):,}")
|
||||
self.logger.info(
|
||||
f" Single years (is_year_range=False): {yr.get(False, 0):,}"
|
||||
)
|
||||
self.logger.info(
|
||||
f" Year ranges (is_year_range=True): {yr.get(True, 0):,}"
|
||||
)
|
||||
|
||||
# Validasi kolom framework tersedia
|
||||
if 'framework' not in self.df_clean.columns:
|
||||
raise ValueError(
|
||||
"Kolom 'framework' tidak ditemukan di dim_indicator. "
|
||||
"Pastikan bigquery_cleaned_layer.py dan bigquery_dimensional_model.py "
|
||||
"sudah dijalankan dengan versi terbaru."
|
||||
)
|
||||
|
||||
fw_dist = self.df_clean.drop_duplicates('indicator_id')['framework'].value_counts()
|
||||
self.logger.info(f" Framework distribution (per indikator unik):")
|
||||
for fw, cnt in fw_dist.items():
|
||||
self.logger.info(f" {fw}: {cnt} indicators")
|
||||
|
||||
self.df_indicator = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
|
||||
self.df_country = read_from_bigquery(self.client, 'dim_country', layer='gold')
|
||||
@@ -228,10 +265,15 @@ class AnalyticalLayerLoader:
|
||||
self.logger.info(f"\n [+] Valid: {len(valid_combinations):,}")
|
||||
self.logger.info(f" [-] Removed: {len(removed_combinations):,}")
|
||||
|
||||
df_valid = pd.DataFrame(valid_combinations)
|
||||
df_valid['key'] = df_valid['country_id'].astype(str) + '_' + df_valid['indicator_id'].astype(str)
|
||||
self.df_clean['key'] = (self.df_clean['country_id'].astype(str) + '_' +
|
||||
self.df_clean['indicator_id'].astype(str))
|
||||
df_valid = pd.DataFrame(valid_combinations)
|
||||
df_valid['key'] = (
|
||||
df_valid['country_id'].astype(str) + '_' +
|
||||
df_valid['indicator_id'].astype(str)
|
||||
)
|
||||
self.df_clean['key'] = (
|
||||
self.df_clean['country_id'].astype(str) + '_' +
|
||||
self.df_clean['indicator_id'].astype(str)
|
||||
)
|
||||
|
||||
original_count = len(self.df_clean)
|
||||
self.df_clean = self.df_clean[self.df_clean['key'].isin(df_valid['key'])].copy()
|
||||
@@ -265,13 +307,17 @@ class AnalyticalLayerLoader:
|
||||
f"{row['pillar_count']}/{total_pillars} pillars"
|
||||
)
|
||||
|
||||
selected_countries = country_pillar_count[country_pillar_count['pillar_count'] == total_pillars]
|
||||
selected_countries = country_pillar_count[
|
||||
country_pillar_count['pillar_count'] == total_pillars
|
||||
]
|
||||
self.selected_country_ids = selected_countries['country_id'].tolist()
|
||||
|
||||
self.logger.info(f"\n FIXED SET: {len(self.selected_country_ids)} countries")
|
||||
|
||||
original_count = len(self.df_clean)
|
||||
self.df_clean = self.df_clean[self.df_clean['country_id'].isin(self.selected_country_ids)].copy()
|
||||
self.df_clean = self.df_clean[
|
||||
self.df_clean['country_id'].isin(self.selected_country_ids)
|
||||
].copy()
|
||||
|
||||
self.logger.info(f" Rows before: {original_count:,}")
|
||||
self.logger.info(f" Rows after: {len(self.df_clean):,}")
|
||||
@@ -285,7 +331,9 @@ class AnalyticalLayerLoader:
|
||||
indicator_country_start = self.df_clean.groupby([
|
||||
'indicator_id', 'indicator_name', 'country_id'
|
||||
])['year'].min().reset_index()
|
||||
indicator_country_start.columns = ['indicator_id', 'indicator_name', 'country_id', 'start_year']
|
||||
indicator_country_start.columns = [
|
||||
'indicator_id', 'indicator_name', 'country_id', 'start_year'
|
||||
]
|
||||
|
||||
indicator_max_start = indicator_country_start.groupby([
|
||||
'indicator_id', 'indicator_name'
|
||||
@@ -324,7 +372,9 @@ class AnalyticalLayerLoader:
|
||||
else:
|
||||
removed_indicators.append({
|
||||
'indicator_name': indicator_name,
|
||||
'reason' : f"missing countries in years: {', '.join(problematic_years[:5])}"
|
||||
'reason' : (
|
||||
f"missing countries in years: {', '.join(problematic_years[:5])}"
|
||||
)
|
||||
})
|
||||
|
||||
self.logger.info(f"\n [+] Valid: {len(valid_indicators)}")
|
||||
@@ -334,12 +384,17 @@ class AnalyticalLayerLoader:
|
||||
raise ValueError("No valid indicators found after filtering!")
|
||||
|
||||
original_count = len(self.df_clean)
|
||||
self.df_clean = self.df_clean[self.df_clean['indicator_id'].isin(valid_indicators)].copy()
|
||||
self.df_clean = self.df_clean[
|
||||
self.df_clean['indicator_id'].isin(valid_indicators)
|
||||
].copy()
|
||||
|
||||
self.df_clean = self.df_clean.merge(
|
||||
indicator_max_start[['indicator_id', 'max_start_year']], on='indicator_id', how='left'
|
||||
indicator_max_start[['indicator_id', 'max_start_year']],
|
||||
on='indicator_id', how='left'
|
||||
)
|
||||
self.df_clean = self.df_clean[self.df_clean['year'] >= self.df_clean['max_start_year']].copy()
|
||||
self.df_clean = self.df_clean[
|
||||
self.df_clean['year'] >= self.df_clean['max_start_year']
|
||||
].copy()
|
||||
self.df_clean = self.df_clean.drop('max_start_year', axis=1)
|
||||
|
||||
self.logger.info(f"\n Rows before: {original_count:,}")
|
||||
@@ -355,12 +410,16 @@ class AnalyticalLayerLoader:
|
||||
self.logger.info("=" * 80)
|
||||
|
||||
expected_countries = len(self.selected_country_ids)
|
||||
verification = self.df_clean.groupby(['indicator_id', 'year'])['country_id'].nunique().reset_index()
|
||||
verification = self.df_clean.groupby(
|
||||
['indicator_id', 'year']
|
||||
)['country_id'].nunique().reset_index()
|
||||
verification.columns = ['indicator_id', 'year', 'country_count']
|
||||
all_good = (verification['country_count'] == expected_countries).all()
|
||||
|
||||
if all_good:
|
||||
self.logger.info(f" VERIFICATION PASSED — all combinations have {expected_countries} countries")
|
||||
self.logger.info(
|
||||
f" VERIFICATION PASSED — all combinations have {expected_countries} countries"
|
||||
)
|
||||
else:
|
||||
bad = verification[verification['country_count'] != expected_countries]
|
||||
for _, row in bad.head(10).iterrows():
|
||||
@@ -372,6 +431,101 @@ class AnalyticalLayerLoader:
|
||||
|
||||
return True
|
||||
|
||||
def calculate_yoy(self):
|
||||
"""
|
||||
Hitung Year-over-Year (YoY) per indikator per negara.
|
||||
|
||||
Kolom yang ditambahkan ke df_clean:
|
||||
yoy_change : selisih absolut -> value - value_tahun_sebelumnya
|
||||
yoy_pct : perubahan relatif -> (yoy_change / abs(value_prev)) * 100
|
||||
|
||||
Catatan:
|
||||
- Baris tahun pertama per kombinasi country-indicator akan bernilai NULL
|
||||
(tidak ada tahun sebelumnya sebagai pembanding) — ini intentional.
|
||||
- value_prev di-drop setelah kalkulasi, tidak ikut disimpan ke BigQuery.
|
||||
- Dilakukan SETELAH verify_no_gaps() agar data sudah clean dan sorted benar.
|
||||
"""
|
||||
self.logger.info("\n" + "=" * 80)
|
||||
self.logger.info("STEP 6b: CALCULATE YEAR-OVER-YEAR (YoY) PER INDICATOR PER COUNTRY")
|
||||
self.logger.info("=" * 80)
|
||||
|
||||
df = self.df_clean.sort_values(['country_id', 'indicator_id', 'year']).copy()
|
||||
|
||||
# Nilai tahun sebelumnya (shifted within each country-indicator group)
|
||||
df['value_prev'] = df.groupby(['country_id', 'indicator_id'])['value'].shift(1)
|
||||
|
||||
# YoY absolute change: value(t) - value(t-1)
|
||||
df['yoy_change'] = df['value'] - df['value_prev']
|
||||
|
||||
# YoY percentage change: (yoy_change / |value_prev|) * 100
|
||||
# Hindari division by zero — jika value_prev == 0 atau NaN, hasilnya NaN
|
||||
df['yoy_pct'] = np.where(
|
||||
df['value_prev'].notna() & (df['value_prev'] != 0),
|
||||
(df['yoy_change'] / df['value_prev'].abs()) * 100,
|
||||
np.nan
|
||||
)
|
||||
|
||||
# Drop kolom bantu value_prev, tidak ikut disimpan ke BigQuery
|
||||
df = df.drop(columns=['value_prev'])
|
||||
|
||||
# Log ringkasan
|
||||
total_rows = len(df)
|
||||
valid_yoy = df['yoy_pct'].notna().sum()
|
||||
null_yoy = df['yoy_pct'].isna().sum()
|
||||
|
||||
self.logger.info(f" Total rows : {total_rows:,}")
|
||||
self.logger.info(f" YoY calculated : {valid_yoy:,}")
|
||||
self.logger.info(f" YoY NULL (base yr): {null_yoy:,} <- tahun pertama per country-indicator")
|
||||
|
||||
# Log distribusi YoY per indikator (sample)
|
||||
per_ind = (
|
||||
df[df['yoy_pct'].notna()]
|
||||
.groupby(['indicator_id', 'indicator_name'])['yoy_pct']
|
||||
.agg(['mean', 'std', 'min', 'max'])
|
||||
.reset_index()
|
||||
)
|
||||
per_ind.columns = ['indicator_id', 'indicator_name', 'mean', 'std', 'min', 'max']
|
||||
|
||||
self.logger.info(f"\n YoY summary per indicator (top 10 by abs mean change):")
|
||||
self.logger.info(f" {'-'*100}")
|
||||
self.logger.info(
|
||||
f" {'ID':<5} {'Indicator Name':<52} {'Mean%':>8} {'Std%':>8} {'Min%':>8} {'Max%':>8}"
|
||||
)
|
||||
self.logger.info(f" {'-'*100}")
|
||||
|
||||
top_ind = per_ind.reindex(
|
||||
per_ind['mean'].abs().sort_values(ascending=False).index
|
||||
).head(10)
|
||||
|
||||
for _, row in top_ind.iterrows():
|
||||
self.logger.info(
|
||||
f" {int(row['indicator_id']):<5} {row['indicator_name'][:50]:<52} "
|
||||
f"{row['mean']:>+8.2f} {row['std']:>8.2f} "
|
||||
f"{row['min']:>+8.2f} {row['max']:>+8.2f}"
|
||||
)
|
||||
|
||||
# Log distribusi YoY per negara (ringkasan)
|
||||
per_country = (
|
||||
df[df['yoy_pct'].notna()]
|
||||
.groupby(['country_id', 'country_name'])['yoy_pct']
|
||||
.agg(['mean', 'std'])
|
||||
.reset_index()
|
||||
)
|
||||
per_country.columns = ['country_id', 'country_name', 'mean_yoy', 'std_yoy']
|
||||
|
||||
self.logger.info(f"\n YoY summary per country:")
|
||||
self.logger.info(f" {'-'*60}")
|
||||
self.logger.info(f" {'Country':<30} {'Mean YoY%':>10} {'Std YoY%':>10}")
|
||||
self.logger.info(f" {'-'*60}")
|
||||
for _, row in per_country.sort_values('mean_yoy', ascending=False).iterrows():
|
||||
self.logger.info(
|
||||
f" {row['country_name']:<30} {row['mean_yoy']:>+10.2f} {row['std_yoy']:>10.2f}"
|
||||
)
|
||||
|
||||
self.df_clean = df
|
||||
self.logger.info(f"\n [OK] YoY columns added: yoy_change, yoy_pct")
|
||||
return self.df_clean
|
||||
|
||||
def analyze_indicator_availability_by_year(self):
|
||||
self.logger.info("\n" + "=" * 80)
|
||||
self.logger.info("STEP 7: ANALYZE INDICATOR AVAILABILITY BY YEAR")
|
||||
@@ -394,39 +548,62 @@ class AnalyticalLayerLoader:
|
||||
)
|
||||
|
||||
indicator_details = self.df_clean.groupby([
|
||||
'indicator_id', 'indicator_name', 'pillar_name', 'direction'
|
||||
'indicator_id', 'indicator_name', 'pillar_name', 'direction', 'framework'
|
||||
]).agg({'year': ['min', 'max'], 'country_id': 'nunique'}).reset_index()
|
||||
indicator_details.columns = [
|
||||
'indicator_id', 'indicator_name', 'pillar_name', 'direction',
|
||||
'indicator_id', 'indicator_name', 'pillar_name', 'direction', 'framework',
|
||||
'start_year', 'end_year', 'country_count'
|
||||
]
|
||||
indicator_details['year_range'] = (
|
||||
indicator_details['start_year'].astype(int).astype(str) + '-' +
|
||||
indicator_details['end_year'].astype(int).astype(str)
|
||||
)
|
||||
indicator_details = indicator_details.sort_values(['pillar_name', 'start_year', 'indicator_name'])
|
||||
indicator_details = indicator_details.sort_values(
|
||||
['framework', 'pillar_name', 'start_year', 'indicator_name']
|
||||
)
|
||||
|
||||
self.logger.info(f"\nTotal Indicators: {len(indicator_details)}")
|
||||
for pillar, count in indicator_details.groupby('pillar_name').size().items():
|
||||
self.logger.info(f" {pillar}: {count} indicators")
|
||||
|
||||
self.logger.info(f"\n{'-'*100}")
|
||||
self.logger.info(f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} {'Years':<12} {'Dir':<8} {'Countries'}")
|
||||
self.logger.info(f"{'-'*100}")
|
||||
self.logger.info(f"\nFramework breakdown:")
|
||||
for fw, count in indicator_details.groupby('framework').size().items():
|
||||
self.logger.info(f" {fw}: {count} indicators")
|
||||
|
||||
self.logger.info(f"\n{'-'*110}")
|
||||
self.logger.info(
|
||||
f"{'ID':<5} {'Indicator Name':<55} {'Pillar':<15} "
|
||||
f"{'Framework':<10} {'Years':<12} {'Dir':<8} {'Countries'}"
|
||||
)
|
||||
self.logger.info(f"{'-'*110}")
|
||||
for _, row in indicator_details.iterrows():
|
||||
direction = 'higher+' if row['direction'] == 'higher_better' else 'lower-'
|
||||
self.logger.info(
|
||||
f"{int(row['indicator_id']):<5} {row['indicator_name'][:52]:<55} "
|
||||
f"{row['pillar_name'][:13]:<15} {row['year_range']:<12} "
|
||||
f"{direction:<8} {int(row['country_count'])}"
|
||||
f"{row['pillar_name'][:13]:<15} {row['framework']:<10} "
|
||||
f"{row['year_range']:<12} {direction:<8} {int(row['country_count'])}"
|
||||
)
|
||||
|
||||
return year_stats
|
||||
|
||||
def save_analytical_table(self):
|
||||
# ---------------------------------------------------------------
|
||||
# CHANGED: nama tabel baru + kolom lengkap untuk Looker Studio
|
||||
# ---------------------------------------------------------------
|
||||
"""
|
||||
Simpan fact_asean_food_security_selected ke Gold layer.
|
||||
|
||||
Kolom yang disimpan:
|
||||
country_id, country_name — dimensi negara
|
||||
indicator_id, indicator_name — dimensi indikator
|
||||
direction — arah penilaian (higher/lower_better)
|
||||
framework — MDGs / SDGs (untuk filter Looker Studio)
|
||||
pillar_id, pillar_name — dimensi pilar
|
||||
time_id, year — dimensi waktu
|
||||
value — nilai indikator
|
||||
yoy_change — perubahan absolut YoY (NULLABLE: NULL di tahun pertama)
|
||||
yoy_pct — perubahan relatif YoY dalam % (NULLABLE: NULL di tahun pertama)
|
||||
|
||||
Kolom framework memungkinkan filter langsung di Looker Studio tanpa join ke dim_indicator.
|
||||
Kolom yoy_change dan yoy_pct memungkinkan analisis tren tahunan langsung di Looker Studio.
|
||||
"""
|
||||
table_name = 'fact_asean_food_security_selected'
|
||||
|
||||
self.logger.info("\n" + "=" * 80)
|
||||
@@ -434,22 +611,48 @@ class AnalyticalLayerLoader:
|
||||
self.logger.info("=" * 80)
|
||||
|
||||
try:
|
||||
# ------------------------------------------------------------------
|
||||
# Pilih kolom: ID + Nama lengkap + value
|
||||
# Kolom nama memudahkan filtering/slicing langsung di Looker Studio
|
||||
# tanpa perlu join ulang ke tabel dimensi.
|
||||
# ------------------------------------------------------------------
|
||||
# Pastikan kolom framework tersedia di df_clean
|
||||
if 'framework' not in self.df_clean.columns:
|
||||
self.logger.warning(
|
||||
" [WARN] Kolom 'framework' tidak ada di df_clean. "
|
||||
"Melakukan join ke dim_indicator sebagai fallback..."
|
||||
)
|
||||
dim_ind = read_from_bigquery(self.client, 'dim_indicator', layer='gold')
|
||||
if 'framework' in dim_ind.columns:
|
||||
self.df_clean = self.df_clean.merge(
|
||||
dim_ind[['indicator_id', 'framework']],
|
||||
on='indicator_id', how='left'
|
||||
)
|
||||
self.df_clean['framework'] = self.df_clean['framework'].fillna('MDGs')
|
||||
self.logger.info(" [OK] framework di-join dari dim_indicator")
|
||||
else:
|
||||
self.df_clean['framework'] = 'MDGs'
|
||||
self.logger.warning(
|
||||
" [WARN] dim_indicator juga tidak punya kolom framework. "
|
||||
"Default: MDGs. Jalankan ulang pipeline dari cleaned_layer."
|
||||
)
|
||||
|
||||
# Pastikan kolom YoY tersedia — fallback jika calculate_yoy() tidak dipanggil
|
||||
if 'yoy_change' not in self.df_clean.columns or 'yoy_pct' not in self.df_clean.columns:
|
||||
self.logger.warning(
|
||||
" [WARN] Kolom YoY tidak ditemukan. Menjalankan calculate_yoy() sebagai fallback..."
|
||||
)
|
||||
self.calculate_yoy()
|
||||
|
||||
analytical_df = self.df_clean[[
|
||||
'country_id',
|
||||
'country_name',
|
||||
'indicator_id',
|
||||
'indicator_name',
|
||||
'direction',
|
||||
'framework',
|
||||
'pillar_id',
|
||||
'pillar_name',
|
||||
'time_id',
|
||||
'year',
|
||||
'value',
|
||||
'yoy_change',
|
||||
'yoy_pct',
|
||||
]].copy()
|
||||
|
||||
analytical_df = analytical_df.sort_values(
|
||||
@@ -462,27 +665,46 @@ class AnalyticalLayerLoader:
|
||||
analytical_df['indicator_id'] = analytical_df['indicator_id'].astype(int)
|
||||
analytical_df['indicator_name']= analytical_df['indicator_name'].astype(str)
|
||||
analytical_df['direction'] = analytical_df['direction'].astype(str)
|
||||
analytical_df['framework'] = analytical_df['framework'].astype(str)
|
||||
analytical_df['pillar_id'] = analytical_df['pillar_id'].astype(int)
|
||||
analytical_df['pillar_name'] = analytical_df['pillar_name'].astype(str)
|
||||
analytical_df['time_id'] = analytical_df['time_id'].astype(int)
|
||||
analytical_df['year'] = analytical_df['year'].astype(int)
|
||||
analytical_df['value'] = analytical_df['value'].astype(float)
|
||||
# yoy_change dan yoy_pct tetap float — NULL (NaN) di tahun pertama adalah intentional
|
||||
analytical_df['yoy_change'] = analytical_df['yoy_change'].astype(float)
|
||||
analytical_df['yoy_pct'] = analytical_df['yoy_pct'].astype(float)
|
||||
|
||||
self.logger.info(f" Kolom yang disimpan: {list(analytical_df.columns)}")
|
||||
self.logger.info(f" Total rows: {len(analytical_df):,}")
|
||||
|
||||
# Schema BigQuery
|
||||
# Log distribusi framework
|
||||
fw_dist = analytical_df.drop_duplicates('indicator_id')['framework'].value_counts()
|
||||
self.logger.info(f" Framework distribution (per indikator unik):")
|
||||
for fw, cnt in fw_dist.items():
|
||||
self.logger.info(f" {fw}: {cnt} indicators")
|
||||
|
||||
# Log statistik YoY
|
||||
yoy_valid = analytical_df['yoy_pct'].notna().sum()
|
||||
yoy_null = analytical_df['yoy_pct'].isna().sum()
|
||||
self.logger.info(f" YoY rows (calculated): {yoy_valid:,}")
|
||||
self.logger.info(f" YoY rows (NULL/base) : {yoy_null:,}")
|
||||
|
||||
schema = [
|
||||
bigquery.SchemaField("country_id", "INTEGER", mode="REQUIRED"),
|
||||
bigquery.SchemaField("country_name", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("indicator_id", "INTEGER", mode="REQUIRED"),
|
||||
bigquery.SchemaField("indicator_name", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("direction", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("framework", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("pillar_id", "INTEGER", mode="REQUIRED"),
|
||||
bigquery.SchemaField("pillar_name", "STRING", mode="REQUIRED"),
|
||||
bigquery.SchemaField("time_id", "INTEGER", mode="REQUIRED"),
|
||||
bigquery.SchemaField("year", "INTEGER", mode="REQUIRED"),
|
||||
bigquery.SchemaField("value", "FLOAT", mode="REQUIRED"),
|
||||
# NULLABLE karena tahun pertama per country-indicator tidak memiliki nilai sebelumnya
|
||||
bigquery.SchemaField("yoy_change", "FLOAT", mode="NULLABLE"),
|
||||
bigquery.SchemaField("yoy_pct", "FLOAT", mode="NULLABLE"),
|
||||
]
|
||||
|
||||
rows_loaded = load_to_bigquery(
|
||||
@@ -508,17 +730,25 @@ class AnalyticalLayerLoader:
|
||||
'fixed_countries': len(self.selected_country_ids),
|
||||
'no_gaps' : True,
|
||||
'layer' : 'gold',
|
||||
'columns' : 'id + name + value (Looker Studio ready)'
|
||||
'columns' : (
|
||||
'id + name + direction + framework + value + '
|
||||
'yoy_change + yoy_pct (Looker Studio ready)'
|
||||
)
|
||||
}),
|
||||
'validation_metrics' : json.dumps({
|
||||
'fixed_countries' : len(self.selected_country_ids),
|
||||
'total_indicators': int(self.df_clean['indicator_id'].nunique())
|
||||
'total_indicators': int(self.df_clean['indicator_id'].nunique()),
|
||||
'framework_dist' : fw_dist.to_dict(),
|
||||
'yoy_rows_valid' : int(yoy_valid),
|
||||
'yoy_rows_null' : int(yoy_null),
|
||||
})
|
||||
}
|
||||
save_etl_metadata(self.client, metadata)
|
||||
|
||||
self.logger.info(f" ✓ {table_name}: {rows_loaded:,} rows → [DW/Gold] fs_asean_gold")
|
||||
self.logger.info(f" Metadata → [AUDIT] etl_metadata")
|
||||
self.logger.info(
|
||||
f" {table_name}: {rows_loaded:,} rows -> [DW/Gold] fs_asean_gold"
|
||||
)
|
||||
self.logger.info(f" Metadata -> [AUDIT] etl_metadata")
|
||||
return rows_loaded
|
||||
|
||||
except Exception as e:
|
||||
@@ -530,7 +760,9 @@ class AnalyticalLayerLoader:
|
||||
self.pipeline_metadata['start_time'] = self.pipeline_start
|
||||
|
||||
self.logger.info("\n" + "=" * 80)
|
||||
self.logger.info("Output: fact_asean_food_security_selected → fs_asean_gold")
|
||||
self.logger.info("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
||||
self.logger.info("Kolom: country_id/name, indicator_id/name, direction, framework,")
|
||||
self.logger.info(" pillar_id/name, time_id, year, value, yoy_change, yoy_pct")
|
||||
self.logger.info("=" * 80)
|
||||
|
||||
self.load_source_data()
|
||||
@@ -539,6 +771,7 @@ class AnalyticalLayerLoader:
|
||||
self.select_countries_with_all_pillars()
|
||||
self.filter_indicators_consistent_across_fixed_countries()
|
||||
self.verify_no_gaps()
|
||||
self.calculate_yoy() # <-- Step 6b: hitung YoY
|
||||
self.analyze_indicator_availability_by_year()
|
||||
self.save_analytical_table()
|
||||
|
||||
@@ -577,7 +810,7 @@ def run_analytical_layer():
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=" * 80)
|
||||
print("Output: fact_asean_food_security_selected → fs_asean_gold")
|
||||
print("Output: fact_asean_food_security_selected -> fs_asean_gold")
|
||||
print("=" * 80)
|
||||
|
||||
logger = setup_logging()
|
||||
|
||||
Reference in New Issue
Block a user